Spaces:
Runtime error
Runtime error
import re | |
from datetime import datetime | |
from typing import Tuple | |
import gradio as gr | |
import numpy as np | |
import torch | |
from espnet2.bin.tts_inference import Text2Speech | |
# Tag of the pretrained checkpoint handed to `Text2Speech.from_pretrained`.
# The model is loaded once, when this module is first executed, and is then
# shared by every call that reads the module-level `model`.
_MODEL_TAG = "yesyesjaewook/jets-jaewook-ko"
model = Text2Speech.from_pretrained(_MODEL_TAG)
# Lookup table from emotion label to integer id. The labels are the choices
# offered by the `emotion` radio in the UI, and the id of the selected label
# is what `synthesize` passes to the model as `lids`. Ids are assigned
# sequentially (1..88) in listing order.
# NOTE(review): the keys look mis-encoded (presumably Korean text decoded with
# the wrong charset) and each line carries trailing ` | |` extraction residue;
# confirm against the original file before editing any entry.
# NOTE(review): the entries for ids 48 and 49 render as the same key here; if
# they really are identical, the later value (49) silently wins. Verify.
emotions = { | |
"κ°μΎλ€": 1, | |
"κ°κ°νλ€": 2, | |
"κ°μμ€λ½λ€": 3, | |
"κ²½λ©Έμ€λ½λ€": 4, | |
"κ³ λ§λ€": 5, | |
"κ³ ν΅μ€λ½λ€": 6, | |
"κ³€λνλ€": 7, | |
"κ΄΄λ‘λ€": 8, | |
"κ΅μ₯νλ€": 9, | |
"그립λ€": 10, | |
"κΈνλ€": 11, | |
"κΈ°λ§νλ€": 12, | |
"κΈ΄λ°νλ€": 13, | |
"κΊΌλ¦ΌμΉνλ€": 14, | |
"λμ°νλ€": 15, | |
"λλ°μλ€": 16, | |
"λ¨μ¬μ€λ½λ€": 17, | |
"λμ νλ€": 18, | |
"λ Ήλ Ήνλ€": 19, | |
"λ€κΈνλ€": 20, | |
"λ΄λ°±νλ€": 21, | |
"λλμμ΄νλ€": 22, | |
"λλ ΅λ€": 23, | |
"λ₯κ·Έμ€λ¦νλ€": 24, | |
"λλ ·νλ€": 25, | |
"λ§μ‘±μ€λ½λ€": 26, | |
"λ§νλ€": 27, | |
"맀μ νλ€": 28, | |
"λͺ»λλ€": 29, | |
"무κ΄μ¬νλ€": 30, | |
"무λ‘νλ€": 31, | |
"λν΄νλ€": 32, | |
"λ°λ€": 33, | |
"λ°κ°λ€": 34, | |
"λ°°μλ§λνλ€": 35, | |
"λ²κ²λ€": 36, | |
"보μκ²μλ€": 37, | |
"λΆλλ½λ€": 38, | |
"λΆλΉνλ€": 39, | |
"λΆμ νλ€": 40, | |
"λΆμνλ€": 41, | |
"λΆννλ€": 42, | |
"λΆνμ€νλ€": 43, | |
"λ»λ»μ€λ½λ€": 44, | |
"λΌμνλ€": 45, | |
"μ¬λμ€λ½λ€": 46, | |
"μλ½λ€": 47, | |
"μμνλ€": 48, | |
"μμνλ€": 49, | |
"μ¬νλ€": 50, | |
"μ무룩νλ€": 51, | |
"μ¬λνλ€": 52, | |
"μ°λ¦¬λ€": 53, | |
"μΈμΈνλ€": 54, | |
"μλκΌ½λ€": 55, | |
"μλνλ€": 56, | |
"μμ½λ€": 57, | |
"μμ°νλ€": 58, | |
"μ λνλ€": 59, | |
"μνκΉλ€": 60, | |
"μ½νλ€": 61, | |
"μλ€": 62, | |
"μ΄λ ΄ννλ€": 63, | |
"μ΄λ¦¬λ₯μ νλ€": 64, | |
"μ΅μΈνλ€": 65, | |
"μΈμ§’λ€": 66, | |
"μλ°νλ€": 67, | |
"μμ€νλ€": 68, | |
"μλ±νλ€": 69, | |
"μκ΄μ€λ½λ€": 70, | |
"μμ¬λ‘λ€": 71, | |
"μΈλ‘λ€": 72, | |
"μννλ€": 73, | |
"μμ¬μ€λ½λ€": 74, | |
"μλμ€λ½λ€": 75, | |
"μμΈνλ€": 76, | |
"μμ λ‘λ€": 77, | |
"μ‘°λ§μ‘°λ§νλ€": 78, | |
"μ¦κ²λ€": 79, | |
"μ§μ¦μ€λ½λ€": 80, | |
"μ°½νΌνλ€": 81, | |
"μΉμμ€λ½λ€": 82, | |
"νμ¬νλ€": 83, | |
"ν볡νλ€": 84, | |
"νΌλνλ€": 85, | |
"νκ°λΆνλ€": 86, | |
"νλͺ©νλ€": 87, | |
"νλνλ€": 88, | |
} | |
def float32_to_pcm16(waveform: torch.Tensor) -> np.ndarray:
    """Convert a floating-point waveform tensor to 16-bit PCM samples.

    Each sample is multiplied by the largest int16 value (32767), clipped to
    the int16 range [-32768, 32767], and cast to ``np.int16`` (the cast
    truncates toward zero).

    Args:
        waveform: Tensor of float samples. Values outside [-1.0, 1.0] are
            clipped rather than wrapped.

    Returns:
        A NumPy ``int16`` array with the same shape as *waveform*.
    """
    info = np.iinfo(np.int16)
    # `Tensor.numpy()` raises for tensors that track gradients or live on a
    # non-CPU device; detach and move to host memory first so the conversion
    # works regardless of how the model produced its output.
    samples = waveform.detach().cpu().numpy()
    return (samples * info.max).clip(info.min, info.max).astype(np.int16)
def endswith_punctuation(text: str) -> bool:
    """Return True when *text* ends with ``.``, ``?`` or ``!``.

    Only the final character is inspected: punctuation in the middle of the
    text does not count, and the empty string yields False.
    """
    # The previous unanchored regex search matched a mark anywhere in the
    # string, so multi-sentence input lacking a final terminator was treated
    # as already punctuated.
    return text.endswith((".", "?", "!"))
def synthesize(text: str, emotion: str) -> Tuple[int, np.ndarray]:
    """Synthesize speech for *text* using the selected *emotion*.

    Args:
        text: Text to speak. Surrounding whitespace is stripped, and a "."
            is appended when `endswith_punctuation` reports no terminator.
        emotion: A key of the module-level `emotions` table; its integer id
            is passed to the model as `lids`.

    Returns:
        ``(model.fs, samples)``, where ``samples`` is the model's ``"wav"``
        output converted by `float32_to_pcm16`.

    Raises:
        gr.Error: If *text* is empty after stripping, or *emotion* is not a
            key of `emotions`.
    """
    text = (text or "").strip()
    if not text:
        # Without this guard an empty box is sent to the model as a lone ".".
        raise gr.Error("Please enter some text to synthesize.")
    if emotion not in emotions:
        # Report a readable message instead of a bare KeyError.
        raise gr.Error(f"Unknown emotion: {emotion!r}")
    if not endswith_punctuation(text):
        text += "."
    # Request log: timestamp, selected emotion, and the final text.
    print(f"[{datetime.now().isoformat()}] <{emotion}> {text}")
    output = model(text, lids=np.array(emotions[emotion]))
    return (model.fs, float32_to_pcm16(output["wav"]))
# Build the Gradio Blocks UI: a text box and a synthesize button with an audio
# output, plus a radio selector for the emotion.
# NOTE(review): the nesting of the `with` blocks below cannot be recovered from
# this flattened listing (indentation is lost and each line carries trailing
# ` | |` residue); confirm the layout against the original file.
with gr.Blocks() as demo: | |
gr.Markdown("# μ‘μ¬μ± TTS νλ‘μ νΈ") | |
with gr.Row(): | |
with gr.Column(): | |
# Input text, pre-filled with a sample value.
text = gr.Textbox( | |
label="ν μ€νΈ", | |
value="κ·Έλμ μ¬λν΄μ£Όμ λΆλ€μκ²λ κ°μ¬νλ€λ λ§λ§ μ νκ³ μΆμ΅λλ€. μ λ°©μ‘ νμλ‘ 3λ μ΄ νλ₯΄κ³ μ§κΈ μ¬κΈ°κΉμ§ μ€λλ° κ΅μ₯ν λ Έλ ₯λ§μ΄νμ΅λλ€.", | |
lines=3, | |
) | |
synthesize_button = gr.Button("ν©μ±") | |
with gr.Box(): | |
# NOTE(review): `ouptut` looks like a typo for `output`; it is spelled the same
# way where the click handler is wired up, so the reference still resolves.
ouptut = gr.Audio() | |
gr.Markdown("μμ± λ€μ΄λ‘λλ νλ μ΄μ΄ μ€λ₯Έμͺ½μ Β·Β·Β· λ©λ΄ ν΄λ¦ λΆνλ립λλ€ π") | |
with gr.Column(): | |
# The radio offers every key of `emotions` as a choice.
emotion = gr.Radio(label="κ°μ ", choices=[*emotions.keys()], value="λ΄λ°±νλ€") | |
# Clicking the button calls `synthesize(text, emotion)` and routes its return
# value to the audio component.
synthesize_button.click(fn=synthesize, inputs=[text, emotion], outputs=ouptut) | |
# Launches the app as a top-level statement: there is no `__main__` guard, so
# this also runs whenever the module is imported.
demo.launch() | |