# Page header captured together with the source file:
#   Spaces status: Runtime error
#   File size: 2,305 Bytes
#   Revision: e05a89c
# (The viewer's 1-75 line-number gutter has been dropped.)
import time
import requests
from bs4 import BeautifulSoup
import re
from markdownify import markdownify as md
import pandas as pd
import argparse
def extract_content(url: str, timeout: float = 30.0) -> str:
    """Download one page and return its subject and body as Markdown.

    The page is fetched with ``requests``, the elements matching
    ``#load_content .page-subject`` and ``#load_content .page-content`` are
    located, and their HTML is converted to Markdown with ``markdownify``.
    Every pair of consecutive newlines in the result is then collapsed into
    a single newline.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The Markdown text of the page, or an empty string when neither the
        subject nor the content element is present.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=timeout)
    # Refuse to convert an HTTP error page as if it were real content.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, "html.parser")
    page_subject = soup.select_one("#load_content .page-subject")
    page_content = soup.select_one("#load_content .page-content")

    # select_one() yields None when nothing matches; skipping those parts
    # keeps the literal text "None" out of the generated Markdown.
    html = "".join(
        str(part) for part in (page_subject, page_content) if part is not None
    )
    if not html:
        return ""

    markdown_content = md(
        html,
        heading_style="ATX",
        bullets="-",
        strong_em_symbol="*",
        code_language="python",
        escape_asterisks=False,
        escape_underscores=False,
    )
    # Collapse each run of two newlines into one to tighten the output.
    normalized_text = re.sub(r"\n{2}", "\n", markdown_content)
    return normalized_text
def main(ebook_url):
    """Scrape the pages of a Wikidocs e-book and save them as a Parquet file.

    The table of contents is read from ``ebook_url``, every listed page
    except the first is downloaded through ``extract_content``, and the
    collected ``title`` / ``link`` / ``content`` rows are written to
    ``wikidocs_<book_id>.parquet`` in the current working directory.

    Args:
        ebook_url: URL of the e-book; its last path segment is used as the
            book id in the output file name.

    Raises:
        requests.HTTPError: If the table-of-contents page returns an error
            status.
        requests.RequestException: On connection problems or timeout.
    """
    base_url = "https://wikidocs.net"

    # Extract the book id. Trailing slashes are removed first so that a URL
    # such as ".../book/123/" does not produce an empty id.
    book_id = ebook_url.rstrip("/").split("/")[-1]

    # Fetch the page source; the timeout keeps a stalled server from
    # hanging the script.
    response = requests.get(ebook_url, timeout=30)
    response.raise_for_status()  # abort on HTTP errors
    soup = BeautifulSoup(response.content, "html.parser")

    # Take only the table-of-contents anchors whose href calls page(...).
    toc = soup.select(".list-group-toc a[href^='javascript:page(']")

    # Rows collected from the table of contents.
    data_list = []
    for item in toc:
        title = item.get_text(strip=True)
        # href looks like "javascript:page(<id>)"; keep just the id.
        page_id = item.get("href").split("page(")[-1].rstrip(")")
        link = f"{base_url}/{page_id}"
        data_list.append({"title": title, "link": link})

    # Walk the rows and extract each page's content. The first entry is
    # skipped -- presumably it is the book's landing page; confirm.
    for item in data_list[1:]:
        item["content"] = extract_content(item["link"])
        time.sleep(1)  # pause between requests

    # Convert to a DataFrame. Naming the columns explicitly guarantees that
    # "content" exists even when no page was downloaded, so the dropna()
    # below cannot raise KeyError on a short or empty table of contents.
    df = pd.DataFrame(data_list, columns=["title", "link", "content"])
    df = df.dropna(subset=["content"])

    # Save the DataFrame as a Parquet file.
    parquet_filename = f"wikidocs_{book_id}.parquet"
    df.to_parquet(parquet_filename, index=False)
    print(f"파일이 성공적으로 저장되었습니다: {parquet_filename}")
if __name__ == "__main__":
# ๋ช
๋ น์ด ์ค ์ธ์ ์ฒ๋ฆฌ
parser = argparse.ArgumentParser(description="Wikidocs ebook URL์ ์
๋ ฅํ์ธ์.")
parser.add_argument("ebook_url", type=str, help="Wikidocs ebook URL")
args = parser.parse_args()
main(args.ebook_url)
|