"""Scrape articles from jbpresscenter.com list pages into text files.

List pages are walked one by one; every linked article whose extracted body
contains no Hangul is written to ``clean_data/NEWS/output<id>.txt``.
"""

import os
import re
from typing import Iterable, Optional, Set

import requests
from bs4 import BeautifulSoup

article_url = 'https://www.jbpresscenter.com/news/articleView.html?idxno='

# Directory the article text files are written to.
_OUTPUT_DIR = 'clean_data/NEWS'
# Seconds to wait on any single HTTP request before giving up.
_REQUEST_TIMEOUT = 30
# Only article ids strictly greater than this are considered.
_MIN_ARTICLE_ID = 200000

# Markers that bracket the article body in the flattened page text.
_BODY_START_MARKER = "닫기"
_BODY_END_MARKER = "jbnu.ac.kr"
# Characters skipped after the start marker / trimmed before the end marker.
# NOTE(review): presumably tuned by hand against the site's page layout.
_BODY_START_OFFSET = 15
_BODY_END_TRIM = 25

# Hangul compatibility jamo and precomposed Hangul syllables.
_HANGUL_PATTERN = re.compile("[\u3131-\u3163\uac00-\ud7a3]+")
# Assuming the article ID is a sequence of digits
_ARTICLE_ID_PATTERN = re.compile(r'\d+')


def _fetch(url: str) -> Optional[requests.Response]:
    """Return the HTTP response for *url*, or None if the request fails."""
    try:
        return requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable page should not abort a multi-page scrape.
        print("Request failed:", url, exc)
        return None


def scrape_page(url: str) -> Optional[str]:
    """Download an article page and return its extracted body text.

    Args:
        url: Full URL of the article page.

    Returns:
        The text between the start and end markers (with the fixed offsets
        applied), or None when the page cannot be fetched, does not answer
        with HTTP 200, or does not contain both markers.
    """
    response = _fetch(url)
    if response is None or response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    # NOTE(review): the first positional argument of get_text() is the
    # separator, so the letter 'a' is inserted between text nodes. The slice
    # offsets were presumably tuned against that output, so it is kept as-is;
    # confirm before changing.
    text = soup.get_text('a').strip().replace("\n", "")

    start_index = text.find(_BODY_START_MARKER)
    end_index = text.find(_BODY_END_MARKER)
    if start_index == -1 or end_index == -1:
        # str.find returns -1 for a missing marker; slicing with it would
        # silently produce an unrelated fragment of the page.
        return None
    return text[start_index + _BODY_START_OFFSET:end_index - _BODY_END_TRIM]


def has_korean_word(text: str) -> bool:
    """Return True if *text* contains at least one Hangul character."""
    return _HANGUL_PATTERN.search(text) is not None


def scrape_recursive(url: str, article_ids: Iterable[str]) -> None:
    """Scrape each article and write its text to ``output<id>.txt``.

    Args:
        url: Article URL prefix; each id is appended to it.
        article_ids: Article ids (strings) to download.
    """
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    for k in article_ids:
        text = scrape_page(url + k)
        if text is None:
            continue
        print("working on file", k)
        with open(f'clean_data/NEWS/output{k}.txt', 'w', encoding='utf-8') as f:
            f.write(text)
    print("Files are written")


def scrape_id(url: str) -> Set[str]:
    """Collect ids of articles linked from a list page that contain no Hangul.

    Every ``<a target="_top">`` link is inspected; ids above the minimum are
    probed by downloading the article, and kept only when the extracted body
    has no Korean characters.

    Args:
        url: URL of the article list page.

    Returns:
        The set of matching article ids; empty when the list page cannot be
        retrieved.
    """
    response = _fetch(url)
    if response is None:
        return set()
    if response.status_code != 200:
        print("Failed to retrieve the page:", response.status_code)
        return set()

    soup = BeautifulSoup(response.content, "html.parser")
    article_ids = set()
    seen = set()  # ids already probed, so repeated links cost one request
    for link in soup.find_all('a', target='_top'):
        article_id = extract_article_id(link.get('href'))
        if not article_id or article_id in seen:
            continue
        seen.add(article_id)
        if int(article_id) <= _MIN_ARTICLE_ID:
            continue
        text = scrape_page(article_url + article_id)
        if text is not None and not has_korean_word(text):
            article_ids.add(article_id)
    return article_ids


def extract_article_id(href: Optional[str]) -> Optional[str]:
    """Return the first run of digits in *href*, or None if there is none."""
    if not href:
        # Anchors without an href attribute yield None from link.get('href').
        return None
    match = _ARTICLE_ID_PATTERN.search(href)
    return match.group(0) if match else None


def scrape_recursive_pagination(start: int, end: int) -> None:
    """Scrape list pages *start* through *end* (inclusive) and their articles.

    NOTE: each kept article is downloaded twice, once by scrape_id to filter
    it and once by scrape_recursive to save it.
    """
    for k in range(start, end + 1):
        url = f"https://www.jbpresscenter.com/news/articleList.html?page={k}&box_idxno=&sc_sub_section_code=S2N18&view_type=sm"
        article_ids = scrape_id(url)
        print(article_ids, len(article_ids))
        scrape_recursive(article_url, article_ids)


if __name__ == "__main__":
    scrape_recursive_pagination(6, 39)