ceshidddyyy / crawl.py
xjf6b's picture
Create crawl.py
94be149 verified
raw
history blame
2.72 kB
import requests
import re
from bs4 import BeautifulSoup
import logging
import os
# Configure the root logger when this module is loaded: INFO level and above,
# each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class AirPortCollector:
    """Scrape "airport" entries from a public Telegram channel preview page.

    The page is downloaded with ``requests`` (optionally through the proxy
    named by the ``PROXY`` environment variable), every
    ``div.tgme_widget_message_text`` element is scanned for the labelled
    fields below, and each message that yields at least one field is stored
    as a dict in ``self.airports``.

    Recognised fields (all optional):
        name      -- text following "⦁ 名称:"
        websites  -- list of every http(s) URL following "⦁ 官网:"
        channel   -- "@handle" following "⦁ 频道:"
        group     -- "@handle" following "⦁ 群组:"
    """

    DEFAULT_URL = "https://t.me/s/jichang_list?before=457"
    # Seconds to wait for the server; without a timeout requests.get() can
    # block indefinitely on a stalled connection.
    DEFAULT_TIMEOUT = 30

    # Compiled once for the class instead of being looked up per message.
    _NAME_RE = re.compile(r'⦁ 名称:\s*(.*)')
    _WEBSITE_RE = re.compile(r'⦁ 官网:\s*(https?://\S+)')
    _CHANNEL_RE = re.compile(r'⦁ 频道:\s*(@\S+)')
    _GROUP_RE = re.compile(r'⦁ 群组:\s*(@\S+)')

    def __init__(self, url=None, timeout=DEFAULT_TIMEOUT):
        """Create a collector.

        Args:
            url: Page to scrape; defaults to ``DEFAULT_URL``.
            timeout: Timeout in seconds passed to ``requests.get``.
        """
        self.url = url or self.DEFAULT_URL
        self.timeout = timeout
        self.airports = []
        self.proxy = os.getenv('PROXY')

    def fetch_content(self):
        """Download ``self.url`` and return the response body as text.

        Returns:
            The page text, or ``None`` if the request failed (connection
            error, timeout, or non-2xx status). Failures are logged, not
            raised.
        """
        proxies = {'http': self.proxy, 'https': self.proxy} if self.proxy else None
        try:
            response = requests.get(
                self.url,
                # NOTE(review): TLS certificate verification is disabled here.
                # Kept as in the original so existing proxy setups keep
                # working, but it exposes the request to MITM tampering --
                # confirm whether this is still required.
                verify=False,
                proxies=proxies,
                timeout=self.timeout,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            logging.error(f"Error fetching content: {e}")
            return None
        return response.text

    @classmethod
    def _parse_message(cls, text):
        """Extract the labelled fields from one message's plain text.

        Returns:
            A dict containing only the fields that were found; empty when
            the text holds none of them.
        """
        airport = {}

        name_match = cls._NAME_RE.search(text)
        if name_match:
            airport['name'] = name_match.group(1).strip()

        # A message may list several official sites; keep all of them.
        website_matches = cls._WEBSITE_RE.findall(text)
        if website_matches:
            airport['websites'] = website_matches

        channel_match = cls._CHANNEL_RE.search(text)
        if channel_match:
            airport['channel'] = channel_match.group(1)

        group_match = cls._GROUP_RE.search(text)
        if group_match:
            airport['group'] = group_match.group(1)

        return airport

    def parse_content(self, content):
        """Parse page HTML and append every recognised entry to ``self.airports``.

        Args:
            content: HTML text of the page; falsy values are ignored.
        """
        if not content:
            return
        soup = BeautifulSoup(content, 'html.parser')
        for message in soup.find_all('div', class_='tgme_widget_message_text'):
            # NOTE(review): get_text() is called without a separator, so text
            # from adjacent tags is joined directly -- confirm against the
            # page markup that each field still ends up on its own line.
            airport = self._parse_message(message.get_text())
            if airport:
                self.airports.append(airport)

    def collect(self):
        """Fetch the page and parse it, logging the outcome."""
        content = self.fetch_content()
        if content:
            self.parse_content(content)
            logging.info(f"Collected {len(self.airports)} airports")
        else:
            logging.warning("Failed to fetch content")

    def get_airports(self):
        """Return the list of airport dicts collected so far."""
        return self.airports
def main():
    """Collect airport entries and write their website URLs to a file.

    Every URL found under an entry's ``'websites'`` key is written, one per
    line, to ``/app/subscribes.txt`` (the file is overwritten). Entries
    without websites contribute nothing.
    """
    collector = AirPortCollector()
    collector.collect()
    airports = collector.get_airports()

    # Flatten first so the log line below reports what was actually written:
    # an airport may list several websites, or none at all.
    websites = [
        website
        for airport in airports
        for website in airport.get('websites', [])
    ]

    # Explicit UTF-8: the URL pattern accepts any non-whitespace characters,
    # so a locale-dependent default encoding could fail on non-ASCII URLs.
    with open('/app/subscribes.txt', 'w', encoding='utf-8') as f:
        f.writelines(f"{website}\n" for website in websites)

    logging.info(f"Wrote {len(websites)} airport websites to subscribes.txt")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()