VITS-TTS-Japanese-Only-Sakura-Miko / ParseAmitaroHTML.py
Lycoris53's picture
Duplicate from Lycoris53/VITS-TTS-Japanese-Only-Amitaro
03204e8
pip --cert /etc/pki/ca-trust/source/anchors/tri-ace-CA-2015.cer install --trusted-host pypi.org --trusted-host files.pythonhosted.org beautifulsoup4
from bs4 import BeautifulSoup
# Read the saved HTML page. The context manager closes the handle, and the
# encoding is pinned so the decoded text does not depend on the platform's
# default locale.
# NOTE(review): assumes amitaro.htm is stored as UTF-8 -- confirm against the
# saved page before relying on this.
with open("./amitaro.htm", "r", encoding="utf-8") as f:
    txt = f.read()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the resulting tree can differ from
# machine to machine. "html.parser" ships with Python itself.
soup = BeautifulSoup(txt, "html.parser")

# Dump the parsed document for visual inspection.
print(soup.prettify())
import json
# Load the metadata table from amitaro.json into file_list. Later steps treat
# it as a mapping whose values are dicts carrying "path" and "kana" entries.
# The file is opened as UTF-8 (the standard JSON encoding) instead of the
# platform default, and the context manager closes the handle after parsing.
with open('amitaro.json', encoding="utf-8") as f:
    file_list = json.load(f)
# Attach a "kana" string to every file_list entry that appears in the page.
# A table cell whose first child equals a key of file_list is paired with the
# cell immediately before it; the first child of the first <a> found in that
# preceding cell's first child is stored as the entry's "kana".
td = soup.find_all('td')
for i, val in enumerate(td):
    # The very first cell has no predecessor: td[i - 1] would silently wrap
    # around to the last cell of the page, so it is skipped along with
    # empty cells.
    if i == 0 or len(val.contents) == 0:
        continue
    key = val.contents[0]
    if key not in file_list:
        continue
    previous_contents = td[i - 1].contents
    if len(previous_contents) == 0:
        continue
    # Re-parse the first child as its own fragment so that an <a> which *is*
    # that first child is found as well (searching the node directly would
    # only look at its descendants).
    fragment = BeautifulSoup(str(previous_contents[0]), "html.parser")
    anchor = fragment.find('a')
    # Cells without a link, or with an empty link, carry nothing to record;
    # skip them instead of failing with IndexError.
    if anchor is None or len(anchor.contents) == 0:
        continue
    print(anchor.contents[0])
    file_list[key]["kana"] = str(anchor.contents[0])

# Persist the enriched table. ensure_ascii=False writes non-ASCII characters
# verbatim, so the output encoding is pinned to UTF-8 rather than left to the
# platform default (which may be unable to encode them).
with open("./amitaro_with_kana.json", "w", encoding="utf-8") as outfile:
    json.dump(file_list, outfile, indent=4, ensure_ascii=False)
# Prefix every entry's "path" with the dataset directory.
# NOTE: this mutates file_list in place, so running this step a second time
# on the same data prepends the directory twice.
for val in file_list.values():
    val["path"] = "./data_amitaro22k/" + val["path"]

# Rewrite the JSON file with the updated paths. ensure_ascii=False emits
# non-ASCII characters verbatim, so the file encoding is pinned to UTF-8
# instead of depending on the platform default.
with open("./amitaro_with_kana.json", "w", encoding="utf-8") as outfile:
    json.dump(file_list, outfile, indent=4, ensure_ascii=False)
# Build one "path|10|kana" line per entry of file_list, leaving out entries
# whose kana string is empty or contains an opening parenthesis.
# NOTE(review): the constant 10 in the middle field is presumably a speaker
# id expected by the training code -- confirm against the consumer.
file = [
    f"{entry['path']}|10|{entry['kana']}"
    for entry in file_list.values()
    if len(entry['kana']) != 0 and "(" not in entry['kana']
]
import random

# Every line goes into the training list.
amitaro_train = list(file)

# Draw ceil(len(file) / 10) distinct line indices for the validation list.
# random.sample picks without replacement in a single call, replacing the
# retry-on-duplicate loop and its linear membership checks.
# NOTE(review): validation lines are drawn from the same pool and are NOT
# removed from amitaro_train, so the two lists overlap -- confirm that this
# is intended.
val_count = -(-len(file) // 10)  # integer ceiling of len(file) / 10
rands = random.sample(range(len(file)), val_count)
amitaro_val = [file[idx] for idx in rands]
# Write the training and validation lists, one entry per line. The context
# managers close each file even if a write fails, and the encoding is pinned
# to UTF-8 so non-ASCII text in the lines does not depend on (or fail under)
# the platform default encoding.
with open("amitaro_train.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{val}\n" for val in amitaro_train)

with open("amitaro_val.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{val}\n" for val in amitaro_val)