chatbot-team4 / utils /scrape_JBNU_FOCUS.py
Cyberspyde
Final Update
ce24d59
raw
history blame contribute delete
No virus
702 Bytes
import requests, re
from bs4 import BeautifulSoup
def scrape_page(url, timeout=30):
    """Download *url* and return its visible text flattened onto one line.

    The response body is parsed with BeautifulSoup's ``html.parser``. The
    extracted text has leading/trailing whitespace stripped and every
    newline removed (adjacent lines are joined with no separator).

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a single stalled connection would block
            the caller indefinitely.

    Returns:
        The flattened page text, or ``None`` when the page yields no text.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # NOTE: the HTTP status code is not checked, so the body of an error
    # page is scraped like any other page.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    text = soup.get_text().strip().replace("\n", "")
    if not text:
        return None
    # Echo the scraped text so progress is visible while crawling.
    print(text)
    return text
def scrape_recursive(url, output_file):
    """Scrape a single page and save its text to *output_file*.

    Despite the name, this function does not recurse or follow links: it
    calls ``scrape_page`` once and writes the result.

    Args:
        url: Address of the page to scrape.
        output_file: Path of the UTF-8 text file to (over)write. Missing
            parent directories are created.

    Returns:
        None. Nothing is written when ``scrape_page`` returns ``None``.
    """
    import os

    text = scrape_page(url)
    if text is None:
        return

    # open(..., "w") fails with FileNotFoundError when the target directory
    # is missing, so create it first. A bare filename has no parent to make.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(text)
# Base address of the article view; the article number is appended to it.
url = "https://www.jbnu.ac.kr/eng/?menuID=350&mode=view&no="

# Fetch articles 1 through 319, saving each one to its own numbered file.
for k in range(1, 320):
    scrape_recursive(f"{url}{k}", f"data/output{k}.txt")