Yoon-gu Hwang committed on
Commit
59b8c1b
1 Parent(s): 7a35858

포켓몬 진화 데이터 추가

Browse files
Files changed (2) hide show
  1. make_evolve_dataset.py +43 -0
  2. pokemon_evolve.json +0 -0
make_evolve_dataset.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Crawl the Korean Pokémon fandom wiki and dump evolution data to JSON.

Starting from a seed page, the script follows each page's "next" link,
collects the entry name and the names found in its evolution table, and
writes the result to ``pokemon_evolve.json`` as a list of
``{"name": ..., "evolve": [...]}`` objects.
"""
import urllib.request
import re
import json
import urllib.parse
from urllib.parse import urlsplit, quote
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

# Seed page for the crawl; every later URL is taken from the previous page.
url = 'https://pokemon.fandom.com/ko/wiki/이상해씨_(포켓몬)'

# urlopen() needs an ASCII URL, so percent-encode the non-ASCII path.
url_info = urlsplit(url)
encoded_url = f'{url_info.scheme}://{url_info.netloc}{quote(url_info.path)}'

info = []    # one {"name", "evolve"} record per visited page
errors = []  # names of pages on which no evolution table was found
target_number = 1017

# Compiled once: matches the inline style of the outer evolution table.
evolve_table_style = re.compile(r"^margin:auto; text-align:center;")

# At most ``target_number`` pages are visited; the loop stops earlier when the
# page whose index label equals the target number has been recorded.
for _ in tqdm(range(target_number)):
    # NOTE(review): a browser-like User-Agent is sent, presumably because the
    # site rejects urllib's default one -- confirm.
    req = Request(encoded_url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req) as res:  # close the connection as soon as the body is read
        html = res.read()
    soup = BeautifulSoup(html, 'html.parser')

    name = soup.find("div", {"class": "name-ko"}).text.strip()
    number = soup.find("div", {"class": "index"}).text.strip()

    evol_tables = soup.find("table", style=evolve_table_style)
    if evol_tables is None:
        # Keep crawling instead of losing every record collected so far;
        # the page is remembered so it can be inspected afterwards.
        errors.append(name)
        evolve = []
    else:
        # Each nested table is one evolution stage; its first <span> holds
        # the text that is recorded. Stages without a <span> are skipped.
        evolve = [
            stage.span.text
            for stage in evol_tables.find_all("table")
            if stage.span is not None
        ]
    info.append(dict(name=name, evolve=evolve))

    # Stop before looking up a "next" link once the target entry is stored.
    if number == f"No.{target_number:04d}":
        break

    # Assumes the last anchor of the first table on the page links to the
    # next entry -- TODO confirm against the page layout.
    next_monster = soup.find("table").find_all("a")[-1]['href']
    encoded_url = "https://pokemon.fandom.com" + next_monster

if errors:
    print(f"{len(errors)} page(s) had no evolution table: {errors}")

# ensure_ascii=False writes the Korean text verbatim, so the file encoding
# must be pinned to UTF-8 rather than left to the platform default.
with open('pokemon_evolve.json', 'w', encoding='utf-8') as f:
    json.dump(info, f, ensure_ascii=False, indent=4)
pokemon_evolve.json ADDED
The diff for this file is too large to render. See raw diff