PdfSumGPT / utils /read_web.py
Qifan Zhang
add read web, text; update truncation
f89a7d8
raw
history blame contribute delete
386 Bytes
import re
import requests
from bs4 import BeautifulSoup
def read_web(url: str, *, timeout: float = 30.0) -> str:
    """Download a web page and return its text content.

    The page is fetched with ``requests``, parsed with BeautifulSoup's
    ``html.parser`` backend, and flattened with ``get_text()``. Runs of
    three or more consecutive newlines are then collapsed to exactly two.

    Args:
        url: Address of the page to fetch. A falsy value (empty string or
            ``None``) short-circuits: no request is made.
        timeout: Value forwarded to ``requests.get(..., timeout=...)``, in
            seconds. Without it a stalled server would block the caller
            indefinitely.

    Returns:
        The extracted text, or ``''`` when ``url`` is falsy.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (including ``Timeout`` when the server does
            not answer within ``timeout``).
    """
    if not url:
        return ''

    # Always bound the request: requests waits forever by default.
    resp = requests.get(url, timeout=timeout)

    # NOTE: the HTTP status is deliberately not checked here, so the body of
    # an error page (404, 500, ...) is parsed and returned like any other.
    soup = BeautifulSoup(resp.text, 'html.parser')
    text = soup.get_text()

    # Squeeze large vertical gaps left behind by stripped markup.
    text = re.sub('\n{3,}', '\n\n', text)
    return text
if __name__ == '__main__':
    # Manual smoke test: fetch a sample page and print its extracted text.
    page_text = read_web('https://en.wikipedia.org/wiki/Wiki')
    print(page_text)