# tts/app.py — Hugging Face Space entry point (Gradio TTS demo)
# Author: yesyesjaewook — commit 00c7109 ("Update TTS")
import re
from datetime import datetime
from typing import Tuple
import gradio as gr
import numpy as np
import torch
from espnet2.bin.tts_inference import Text2Speech
# Load the pretrained Korean JETS TTS model from the Hugging Face Hub.
# NOTE(review): this downloads weights at import time — first launch is slow.
model = Text2Speech.from_pretrained("yesyesjaewook/jets-jaewook-ko")
emotions = {
"κ°€μ—Ύλ‹€": 1,
"κ°‘κ°‘ν•˜λ‹€": 2,
"κ°‘μž‘μŠ€λŸ½λ‹€": 3,
"κ²½λ©ΈμŠ€λŸ½λ‹€": 4,
"고맙닀": 5,
"κ³ ν†΅μŠ€λŸ½λ‹€": 6,
"κ³€λž€ν•˜λ‹€": 7,
"κ΄΄λ‘­λ‹€": 8,
"ꡉμž₯ν•˜λ‹€": 9,
"그립닀": 10,
"κΈ‰ν•˜λ‹€": 11,
"κΈ°λ§‰νžˆλ‹€": 12,
"κΈ΄λ°€ν•˜λ‹€": 13,
"κΊΌλ¦ΌμΉ™ν•˜λ‹€": 14,
"λ”μ°ν•˜λ‹€": 15,
"λ‚œλ°μ—†λ‹€": 16,
"λ‚¨μ‚¬μŠ€λŸ½λ‹€": 17,
"λƒ‰μ •ν•˜λ‹€": 18,
"λ…Ήλ…Ήν•˜λ‹€": 19,
"λ‹€κΈ‰ν•˜λ‹€": 20,
"λ‹΄λ°±ν•˜λ‹€": 21,
"λŒ€λ™μ†Œμ΄ν•˜λ‹€": 22,
"두렡닀": 23,
"λ‘₯κ·ΈμŠ€λ¦„ν•˜λ‹€": 24,
"λšœλ ·ν•˜λ‹€": 25,
"λ§Œμ‘±μŠ€λŸ½λ‹€": 26,
"λ§ν•˜λ‹€": 27,
"λ§€μ •ν•˜λ‹€": 28,
"λͺ»λ˜λ‹€": 29,
"λ¬΄κ΄€μ‹¬ν•˜λ‹€": 30,
"λ¬΄λ‘€ν•˜λ‹€": 31,
"λ­‰ν΄ν•˜λ‹€": 32,
"λ°‰λ‹€": 33,
"λ°˜κ°‘λ‹€": 34,
"λ°°μ€λ§λ•ν•˜λ‹€": 35,
"버겁닀": 36,
"λ³΄μž˜κ²ƒμ—†λ‹€": 37,
"λΆ€λ„λŸ½λ‹€": 38,
"λΆ€λ‹Ήν•˜λ‹€": 39,
"λΆ€μœ ν•˜λ‹€": 40,
"λΆˆμŒν•˜λ‹€": 41,
"λΆˆν–‰ν•˜λ‹€": 42,
"λΆˆν™•μ‹€ν•˜λ‹€": 43,
"λ»”λ»”μŠ€λŸ½λ‹€": 44,
"λΌˆμ•„ν”„λ‹€": 45,
"μ‚¬λž‘μŠ€λŸ½λ‹€": 46,
"μ„œλŸ½λ‹€": 47,
"μ„­μ„­ν•˜λ‹€": 48,
"μˆ˜μƒν•˜λ‹€": 49,
"μŠ¬ν”„λ‹€": 50,
"μ‹œλ¬΄λ£©ν•˜λ‹€": 51,
"μ‹¬λž€ν•˜λ‹€": 52,
"쓰리닀": 53,
"μ“Έμ“Έν•˜λ‹€": 54,
"μ•„λ‹ˆκΌ½λ‹€": 55,
"μ•„λ“ν•˜λ‹€": 56,
"아쉽닀": 57,
"μ•„μ°”ν•˜λ‹€": 58,
"μ•…λž„ν•˜λ‹€": 59,
"μ•ˆνƒ€κΉλ‹€": 60,
"μ•½ν•˜λ‹€": 61,
"μ–•λ‹€": 62,
"μ–΄λ ΄ν’‹ν•˜λ‹€": 63,
"어리λ‘₯μ ˆν•˜λ‹€": 64,
"μ–΅μšΈν•˜λ‹€": 65,
"μ–Έμ§’λ‹€": 66,
"μ—„λ°€ν•˜λ‹€": 67,
"μ—„μ€‘ν•˜λ‹€": 68,
"μ—‰λš±ν•˜λ‹€": 69,
"μ˜κ΄‘μŠ€λŸ½λ‹€": 70,
"μ˜ˆμ‚¬λ‘­λ‹€": 71,
"μ™Έλ‘­λ‹€": 72,
"μœ„ν—˜ν•˜λ‹€": 73,
"μ˜μ‹¬μŠ€λŸ½λ‹€": 74,
"μžλž‘μŠ€λŸ½λ‹€": 75,
"μžμ„Έν•˜λ‹€": 76,
"μžμœ λ‘­λ‹€": 77,
"μ‘°λ§ˆμ‘°λ§ˆν•˜λ‹€": 78,
"즐겁닀": 79,
"μ§œμ¦μŠ€λŸ½λ‹€": 80,
"μ°½ν”Όν•˜λ‹€": 81,
"μΉ˜μš•μŠ€λŸ½λ‹€": 82,
"ν•œμ‹¬ν•˜λ‹€": 83,
"ν–‰λ³΅ν•˜λ‹€": 84,
"ν˜Όλž€ν•˜λ‹€": 85,
"ν™€κ°€λΆ„ν•˜λ‹€": 86,
"ν™”λͺ©ν•˜λ‹€": 87,
"νλ­‡ν•˜λ‹€": 88,
}
def float32_to_pcm16(waveform: torch.Tensor) -> np.ndarray:
    """Convert a float waveform to 16-bit PCM samples for gr.Audio.

    Args:
        waveform: float tensor of audio samples (assumed roughly in
            [-1.0, 1.0] — TODO confirm the model's output range).

    Returns:
        ``np.int16`` array; out-of-range samples are clipped rather than
        allowed to wrap around on the int16 cast.
    """
    info = np.iinfo(np.int16)
    # detach + cpu: .numpy() raises on tensors that require grad or live
    # on a CUDA device; the original bare .numpy() would crash there.
    samples = waveform.detach().cpu().numpy()
    return (samples * info.max).clip(info.min, info.max).astype(np.int16)
def endswith_punctuation(text: str) -> bool:
    """Return True if *text* ends with sentence-final punctuation (., ?, !).

    Bug fix: the previous ``re.search(r"[.?!]", text)`` matched punctuation
    ANYWHERE in the string, so multi-sentence input whose final sentence
    lacked punctuation (e.g. "Hi. Bye") was wrongly reported as terminated
    and never got a period appended by the caller.
    """
    return text.endswith((".", "?", "!"))
def synthesize(text: str, emotion: str) -> Tuple[int, np.array]:
    """Run TTS on *text* with the selected emotion label.

    Args:
        text: input sentence(s); a trailing "." is appended when the text
            does not already carry sentence-final punctuation.
        emotion: a key of the module-level ``emotions`` mapping.

    Returns:
        ``(sample_rate, pcm16_samples)`` tuple as expected by gr.Audio.
    """
    sentence = text.strip()
    # Terminate the input like a sentence before handing it to the model.
    sentence = sentence if endswith_punctuation(sentence) else sentence + "."
    # Lightweight request log to stdout (shows up in the Space logs).
    timestamp = datetime.now().isoformat()
    print(f"[{timestamp}] <{emotion}> {sentence}")
    result = model(sentence, lids=np.array(emotions[emotion]))
    return (model.fs, float32_to_pcm16(result["wav"]))
# --- Gradio UI ---------------------------------------------------------
# Left column: text input + synthesize button + audio player.
# Right column: emotion selector (defaults to "λ‹΄λ°±ν•˜λ‹€", "plain").
with gr.Blocks() as demo:
    gr.Markdown("# μ†‘μž¬μš± TTS ν”„λ‘œμ νŠΈ")
    with gr.Row():
        with gr.Column():
            text = gr.Textbox(
                label="ν…μŠ€νŠΈ",
                value="κ·Έλ™μ•ˆ μ‚¬λž‘ν•΄μ£Όμ‹ λΆ„λ“€μ—κ²ŒλŠ” κ°μ‚¬ν•˜λ‹€λŠ” 말만 μ „ν•˜κ³ μ‹ΆμŠ΅λ‹ˆλ‹€. 제 방솑 ν–‡μˆ˜λ‘œ 3년이 흐λ₯΄κ³  μ§€κΈˆ μ—¬κΈ°κΉŒμ§€ μ˜€λŠ”λ° ꡉμž₯히 λ…Έλ ₯λ§Žμ΄ν–ˆμŠ΅λ‹ˆλ‹€.",
                lines=3,
            )
            synthesize_button = gr.Button("ν•©μ„±")
            with gr.Box():
                # Fixed misspelled local name "ouptut" -> "output".
                output = gr.Audio()
            gr.Markdown("μŒμ„± λ‹€μš΄λ‘œλ“œλŠ” ν”Œλ ˆμ΄μ–΄ 였λ₯Έμͺ½μ˜ Β·Β·Β· 메뉴 클릭 λΆ€νƒλ“œλ¦½λ‹ˆλ‹€ πŸ™")
        with gr.Column():
            emotion = gr.Radio(label="감정", choices=[*emotions.keys()], value="λ‹΄λ°±ν•˜λ‹€")
    synthesize_button.click(fn=synthesize, inputs=[text, emotion], outputs=output)
demo.launch()