Spaces:
Runtime error
Runtime error
Yoon-gu Hwang
committed on
Commit
•
59b8c1b
1
Parent(s):
7a35858
포켓몬 진화 데이터 추가
Browse files- make_evolve_dataset.py +43 -0
- pokemon_evolve.json +0 -0
make_evolve_dataset.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import urllib.request
|
2 |
+
import re
|
3 |
+
import json
|
4 |
+
import urllib.parse
|
5 |
+
from urllib.parse import urlsplit, quote
|
6 |
+
from urllib.request import Request, urlopen
|
7 |
+
from bs4 import BeautifulSoup
|
8 |
+
from tqdm import tqdm
|
9 |
+
import pandas as pd
|
10 |
+
|
11 |
+
# Crawl the Korean Pokémon Fandom wiki one page at a time, starting from the
# page below and following each page's link to the next Pokémon, then save
# every Pokémon's evolution line to pokemon_evolve.json.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'

# The start path contains Hangul; percent-encode it so the request URL is ASCII.
url_info = urlsplit(url)
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'

info = []   # one {"name": ..., "evolve": [...]} record per scraped page
erros = []  # pages where no evolution table was found (original spelling kept)
target_number = 1017  # maximum number of pages to fetch, and the index label to stop at

# Compiled once instead of on every iteration; matches the inline style
# attribute used to locate the evolution table.
evol_table_style = re.compile("^margin:auto; text-align:center;")

# At most `target_number` pages are fetched, so that is also the progress total.
for _ in tqdm(range(target_number)):
    # NOTE(review): the browser-like User-Agent is presumably needed because
    # the site rejects urllib's default one -- confirm.
    req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
    # Close each HTTP response promptly; the crawl makes ~1000 requests.
    with urlopen(req) as res:
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    evol_tables = soup.find("table", style=evol_table_style)
    if evol_tables is None:
        # Without this guard a single page lacking the table raised
        # AttributeError and aborted the crawl before anything was saved.
        erros.append(dict(name=name, url=encoded_url))
    else:
        # Each nested table's first <span> text is recorded as one stage.
        info.append(dict(name=name, evolve=[e.span.text for e in evol_tables.find_all("table")]))

    # Stop once the page whose index label equals the target has been scraped.
    if number == f"No.{target_number:04d}":
        break

    # The last link of the first table on the page is taken as the link to
    # the next Pokémon's page.
    next_monster = soup.find("table").find_all("a")[-1]['href']
    encoded_url = "https://pokemon.fandom.com" + next_monster

if erros:
    print(f"{len(erros)} page(s) had no evolution table: {[e['name'] for e in erros]}")

# ensure_ascii=False writes raw Hangul, so the file encoding must be explicit
# rather than whatever the platform's locale default happens to be.
with open('pokemon_evolve.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)
|
pokemon_evolve.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|