acecalisto3 committed
Commit 68b30df · verified · 1 Parent(s): 52c508e

Update app.py

Files changed (1): app.py (+107 -69)
app.py CHANGED
@@ -12,6 +12,7 @@ import zipfile
 import os
 import datetime
 from urllib.parse import urlparse
+import tempfile
 
 # Configure logging
 logging.basicConfig(level=logging.INFO,
@@ -253,56 +254,109 @@ def process_urls(url_input, bulk_toggle, action_radio, max_urls, crawl_depth, mo
 def recognize_intent(instruction: str) -> str:
     instruction = instruction.lower()
 
-    # Patterns for counting images
-    if re.search(r'\b(count the images|how many images|total images|image count)', instruction):
-        return "count_images"
-
-    # Patterns for listing links
-    elif re.search(r'\b(list all links|find hyperlinks|show me urls|extract links)', instruction):
-        return "scrape_links"
-
-    # Patterns for monitoring changes
-    elif re.search(r'\b(monitor changes|watch for updates|detect changes|track updates)', instruction):
-        return "monitor_changes"
-
-    else:
-        return "unknown"
+    # General patterns for actions and data types
+    action_patterns = {
+        r'\b(find|extract|scrape)\s+(links|images|videos|texts|prices|product names|reviews)\b': 'extract_data',
+        r'\b(count)\s+(links|images|videos|products)\b': 'count_data',
+        r'\b(what is|get|fetch)\s+(channel name|subscriber count|viewers)\b': 'fetch_specific_data',
+        r'\b(monitor)\s+changes\b': 'monitor_changes',
+    }
+
+    for pattern, intent in action_patterns.items():
+        if re.search(pattern, instruction):
+            return intent
+    return "unknown"
 
-def generate_command(intent: str, url_input: str, bulk_toggle: bool, max_urls: int, crawl_depth: int) -> str:
-    urls = re.split(r'[,\n]+', url_input.strip()) if bulk_toggle else [url_input]
-    urls = [url.strip() for url in urls if url.strip()]
-    urls = urls[:max_urls]
-
-    if intent == "scrape_links":
-        all_links = []
-        for url in urls:
-            links = extract_links_from_page(url)
-            all_links.extend(links)
-        return f"Extracted links: {', '.join(all_links)}"
-
-    elif intent == "count_images":
-        total_images = 0
-        for url in urls:
-            response = requests.get(url, timeout=10)
-            soup = BeautifulSoup(response.text, 'html.parser')
-            images = soup.find_all('img')
-            total_images += len(images)
-        return f"There are {total_images} images across the specified URLs."
-
+def extract_data_type(instruction: str) -> str:
+    instruction = instruction.lower()
+    data_types = {
+        r'\b(links|images|videos|texts|prices|product names|reviews)\b': 'links',
+        r'\b(links|images|videos|products)\b': 'images',
+        r'\b(channel name|subscriber count|viewers)\b': 'channel name',
+    }
+    for pattern, data_type in data_types.items():
+        if re.search(pattern, instruction):
+            return data_type
+    return "unknown"
+
+def format_output(data, output_format):
+    if output_format == "JSON":
+        return json.dumps(data, indent=2)
+    elif output_format == "Cleaned JSON":
+        # Implement data cleaning logic here
+        return json.dumps(data, indent=2)
+    else:
+        return str(data)
+
+def generate_command(intent: str, url_input: str, data_type: str, output_format: str) -> str:
+    if intent == "extract_data":
+        data = extract_data(url_input, data_type)
+        return format_output(data, output_format)
+    elif intent == "count_data":
+        count = count_data(url_input, data_type)
+        return f"The number of {data_type} is {count}."
+    elif intent == "fetch_specific_data":
+        specific_data = fetch_specific_data(url_input, data_type)
+        return specific_data
     elif intent == "monitor_changes":
-        changes_log = process_urls(url_input, bulk_toggle, "Scrape data", max_urls, crawl_depth, mode='chat')
+        changes_log = monitor_changes(url_input)
         return changes_log
-
-    return "Instruction not recognized. Please try again."
+    else:
+        return "Instruction not recognized. Please try again."
+
+def extract_data(url, data_type):
+    try:
+        response = requests.get(url)
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        if data_type == "links":
+            return [a['href'] for a in soup.find_all('a', href=True)]
+        elif data_type == "images":
+            return [img['src'] for img in soup.find_all('img', src=True)]
+        # Add more data types as needed
+        else:
+            return []
+    except Exception as e:
+        return f"Error extracting {data_type}: {str(e)}"
+
+def count_data(url, data_type):
+    try:
+        response = requests.get(url)
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        if data_type == "links":
+            return len(soup.find_all('a', href=True))
+        elif data_type == "images":
+            return len(soup.find_all('img', src=True))
+        # Add more data types as needed
+        else:
+            return 0
+    except Exception as e:
+        return f"Error counting {data_type}: {str(e)}"
+
+def fetch_specific_data(url, data_type):
+    try:
+        # Implement specific data fetching logic here
+        # For demonstration, return a placeholder
+        return f"Fetched {data_type} from {url}"
+    except Exception as e:
+        return f"Error fetching {data_type}: {str(e)}"
+
+def monitor_changes(url_input):
+    try:
+        # Implement change monitoring logic here
+        # For demonstration, return a placeholder
+        return f"Changes monitored for {url_input}"
+    except Exception as e:
+        return f"Error monitoring changes: {str(e)}"
 
-def chat_based_scrape(instruction, url_input, bulk_toggle, max_urls, crawl_depth):
-    print(f"Received instruction: {instruction}")
-    # Recognize intent
+def chat_based_scrape(instruction, url_input, output_format):
+    # Recognize intent and extract data type if applicable
    intent = recognize_intent(instruction)
-    print(f"Recognized intent: {intent}")
+    data_type = extract_data_type(instruction)
 
     # Generate command based on the recognized intent
-    command_output = generate_command(intent, url_input, bulk_toggle, max_urls, crawl_depth)
+    command_output = generate_command(intent, url_input, data_type, output_format)
 
     return command_output
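Note on the rewritten intent matching: recognize_intent now walks a dict of regex patterns and returns the first intent whose pattern matches, replacing the earlier hard-coded if/elif chain. The companion extract_data_type has overlapping patterns, though: its first pattern already matches images, videos, prices, and so on, and maps them all to 'links', so an instruction like "extract images" resolves to data type 'links'. A minimal sketch of one possible fix (hypothetical, not part of this commit) is to return the keyword that actually matched:

import re

# Hypothetical alternative to extract_data_type: return the matched
# keyword itself instead of mapping overlapping patterns to fixed labels.
def extract_data_type_alt(instruction: str) -> str:
    match = re.search(
        r'\b(links|images|videos|texts|prices|product names|reviews'
        r'|channel name|subscriber count|viewers)\b',
        instruction.lower(),
    )
    return match.group(1) if match else "unknown"

print(extract_data_type_alt("Extract all images"))  # -> images, not 'links'
print(extract_data_type_alt("Count the links"))     # -> links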
 
@@ -371,44 +425,28 @@ def create_interface():
             )
 
         with gr.Tab("Chat-Based Scrape"):
-            chat_instruction = gr.Textbox(
+            instruction = gr.Textbox(
                 label="Enter Instruction",
                 placeholder="e.g., 'Scrape all links' or 'Extract all images'"
             )
-            chat_url_input = gr.Textbox(
-                label="Enter URL(s)",
+            url_input = gr.Textbox(
+                label="Enter URL",
                 value="https://example.com",
-                placeholder="Enter single URL or multiple URLs separated by commas"
-            )
-            chat_bulk_toggle = gr.Checkbox(label="Bulk URLs", value=False)
-            chat_max_urls = gr.Slider(
-                minimum=1,
-                maximum=20,
-                value=5,
-                step=1,
-                label="Max URLs to process"
+                placeholder="Enter the target URL"
             )
-            chat_crawl_depth = gr.Slider(
-                minimum=1,
-                maximum=3,
-                value=1,
-                step=1,
-                label="Crawl Depth"
+            output_format = gr.Radio(
+                ["JSON", "Cleaned JSON", "Raw Data"],
+                label="Output Format",
+                value="JSON"
             )
-            chat_output = gr.Textbox(label="Chat Output")
+            output = gr.Textbox(label="Output")
 
-            chat_button = gr.Button("Submit Instruction", variant="primary")
+            chat_button = gr.Button("Execute Instruction", variant="primary")
 
             chat_button.click(
                 fn=chat_based_scrape,
-                inputs=[
-                    chat_instruction,
-                    chat_url_input,
-                    chat_bulk_toggle,
-                    chat_max_urls,
-                    chat_crawl_depth
-                ],
-                outputs=chat_output
+                inputs=[instruction, url_input, output_format],
+                outputs=output
             )
 
         gr.Markdown(
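The rebuilt tab drops the bulk-URL, max-URL and crawl-depth controls in favour of a single URL plus an output-format selector, and the click handler's inputs list now lines up with chat_based_scrape's three-parameter signature. The "Cleaned JSON" option, however, currently behaves exactly like "JSON", because format_output's cleaning branch is a placeholder. A sketch of what that branch might do, assuming list-of-strings data such as the href/src lists returned by extract_data:

import json

# Illustrative only; the commit leaves the "Cleaned JSON" branch as a TODO.
# One plausible cleaning pass: drop empty entries and deduplicate while
# preserving order, then serialise.
def clean_and_dump(data):
    if isinstance(data, list):
        seen, cleaned = set(), []
        for item in data:
            if item and item not in seen:
                seen.add(item)
                cleaned.append(item)
        data = cleaned
    return json.dumps(data, indent=2)

print(clean_and_dump(["", "/a", "/a", "/b"]))  # -> ["/a", "/b"]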
 
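End to end, the new chat flow is: instruction -> recognize_intent / extract_data_type -> generate_command -> formatted string. A hedged usage sketch, assuming app.py is importable and the target URL is reachable (note that the new extract_data and count_data calls omit the timeout= that the removed code passed to requests.get, so a slow host can hang the request):

from app import chat_based_scrape  # assumes app.py is on the import path

# "extract links" resolves to intent 'extract_data' and data type 'links',
# so this returns a JSON array of href values found on the page.
result = chat_based_scrape("extract links", "https://example.com", "JSON")
print(result)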