Spaces:

xjf6b
/

ceshidddyyy

Running

App Files Files Community

ceshidddyyy / crawl.py

xjf6b

Create crawl.py

94be149 verified 6 months ago

raw

history blame

2.72 kB

	import requests
	import re
	from bs4 import BeautifulSoup
	import logging
	import os

	# Configure the root logger once at import time: INFO and above, each record
	# rendered as "<timestamp> - <level> - <message>".
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

	class AirPortCollector:
	    """Collect airport entries from the web preview of a Telegram channel.

	    Each message on the page is scanned for bullet-style fields (name,
	    official website, Telegram channel, Telegram group); every message that
	    yields at least one field is stored as a dict in ``self.airports``.

	    Attributes:
	        url: Page that ``fetch_content`` downloads.
	        timeout: Seconds to wait for the server before giving up.
	        airports: Parsed entries, appended to by ``parse_content``.
	        proxy: Proxy URL taken from the ``PROXY`` environment variable, or
	            ``None`` when it is unset.
	    """

	    DEFAULT_URL = "https://t.me/s/jichang_list?before=457"
	    DEFAULT_TIMEOUT = 30

	    # Field values end at whitespace or at the next "⦁" marker: get_text()
	    # emits nothing for <br> tags, so consecutive fields can arrive glued
	    # together on a single line.
	    _NAME_RE = re.compile(r'⦁ 名称:[ \t]*([^\n⦁]+)')
	    _WEBSITE_RE = re.compile(r'⦁ 官网:\s*(https?://[^\s⦁]+)')
	    _CHANNEL_RE = re.compile(r'⦁ 频道:\s*(@[^\s⦁]+)')
	    _GROUP_RE = re.compile(r'⦁ 群组:\s*(@[^\s⦁]+)')

	    def __init__(self, url=DEFAULT_URL, timeout=DEFAULT_TIMEOUT):
	        """Set up an empty collector.

	        Args:
	            url: Page to scrape; defaults to the channel preview URL.
	            timeout: Request timeout in seconds.
	        """
	        self.url = url
	        self.timeout = timeout
	        self.airports = []
	        self.proxy = os.getenv('PROXY')

	    def fetch_content(self):
	        """Download ``self.url`` and return the response body as text.

	        Returns:
	            The page text, or ``None`` when the request fails (the error is
	            logged rather than raised).
	        """
	        proxies = {'http': self.proxy, 'https': self.proxy} if self.proxy else None
	        try:
	            # NOTE(review): verify=False disables TLS certificate checking and
	            # leaves the request open to tampering; kept from the original --
	            # confirm whether the deployment really needs it.
	            response = requests.get(
	                self.url,
	                verify=False,
	                proxies=proxies,
	                # Without a timeout a stalled connection would block forever.
	                timeout=self.timeout,
	            )
	            response.raise_for_status()
	        except requests.RequestException as e:
	            logging.error(f"Error fetching content: {e}")
	            return None
	        return response.text

	    @classmethod
	    def _extract_airport(cls, text):
	        """Return the fields found in one message's text as a dict.

	        Keys are present only for fields that matched: ``name`` (str),
	        ``websites`` (list of URLs), ``channel`` and ``group`` (``@handle``
	        strings). An empty dict means no recognised field was found.
	        """
	        airport = {}

	        # Airport name: the rest of the line after the label.
	        name_match = cls._NAME_RE.search(text)
	        name = name_match.group(1).strip() if name_match else ''
	        if name:
	            airport['name'] = name

	        # Official website(s): a message may list more than one.
	        websites = cls._WEBSITE_RE.findall(text)
	        if websites:
	            airport['websites'] = websites

	        # Telegram channel handle.
	        channel_match = cls._CHANNEL_RE.search(text)
	        if channel_match:
	            airport['channel'] = channel_match.group(1)

	        # Telegram group handle.
	        group_match = cls._GROUP_RE.search(text)
	        if group_match:
	            airport['group'] = group_match.group(1)

	        return airport

	    def parse_content(self, content):
	        """Parse page HTML and append every recognised entry to ``self.airports``.

	        Args:
	            content: HTML text of the channel page; falsy input is ignored.
	        """
	        if not content:
	            return

	        soup = BeautifulSoup(content, 'html.parser')
	        for message in soup.find_all('div', class_='tgme_widget_message_text'):
	            airport = self._extract_airport(message.get_text())
	            # Messages without any recognised field are not airport entries.
	            if airport:
	                self.airports.append(airport)

	    def collect(self):
	        """Fetch the page and parse it, logging how many entries are held."""
	        content = self.fetch_content()
	        if content:
	            self.parse_content(content)
	            logging.info(f"Collected {len(self.airports)} airports")
	        else:
	            logging.warning("Failed to fetch content")

	    def get_airports(self):
	        """Return the list of entries collected so far (not a copy)."""
	        return self.airports

	def main(output_path='/app/subscribes.txt'):
	    """Collect airport entries and write their website URLs to a file.

	    Args:
	        output_path: File to (over)write, one URL per line.
	    """
	    collector = AirPortCollector()
	    collector.collect()
	    airports = collector.get_airports()

	    # Entries without a 'websites' key contribute nothing; entries with
	    # several URLs contribute one line each.
	    websites = [
	        website
	        for airport in airports
	        for website in airport.get('websites', ())
	    ]

	    # Explicit encoding so the output does not depend on the host locale.
	    with open(output_path, 'w', encoding='utf-8') as f:
	        f.writelines(f"{website}\n" for website in websites)

	    # Report the number of lines actually written, not the number of entries.
	    logging.info("Wrote %d airport websites to %s", len(websites), output_path)

	# Run the collector only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == "__main__":
	    main()