Spaces:
Sleeping
Sleeping
from functools import lru_cache

import gradio as gr
import torch
from transformers import VitsModel, AutoTokenizer
# import scipy
@lru_cache(maxsize=4)
def _load_tts_backend(lang):
    """Load the MMS-TTS model and tokenizer for ``lang``, memoized per language.

    Both objects come from the ``facebook/mms-tts-{lang}`` checkpoint. The
    cache is bounded so that only a handful of models stay resident at once.
    """
    repo_id = f"facebook/mms-tts-{lang}"
    model = VitsModel.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    return model, tokenizer


def tts(lang: str, text: str):
    """Synthesize speech for ``text`` using the MMS-TTS checkpoint for ``lang``.

    Args:
        lang: Language code interpolated into ``facebook/mms-tts-{lang}`` to
            select the checkpoint.
        text: Text to speak; it is converted to ``str`` before tokenization.

    Returns:
        A ``(sampling_rate, waveform)`` tuple, where ``sampling_rate`` is
        ``model.config.sampling_rate`` and ``waveform`` is the generated
        audio as a NumPy array with singleton dimensions squeezed out.
    """
    # Reuse an already-loaded model/tokenizer instead of reloading the
    # checkpoint on every request.
    model, tokenizer = _load_tts_backend(lang)

    inputs = tokenizer(str(text), return_tensors="pt")
    print("Input shape:", inputs["input_ids"].shape)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Drop singleton dimensions and convert to NumPy for the audio output.
    samples = waveform.squeeze().cpu().numpy()
    return (model.config.sampling_rate, samples)
# Gradio UI: a language dropdown and a text box feed tts(); the returned
# (sampling_rate, waveform) tuple is rendered by the audio component.
iface = gr.Interface(
    fn=tts,
    inputs=[
        # Language selector. "eng" is pre-selected so that a request
        # submitted without touching the dropdown still carries a valid
        # language code rather than None.
        gr.Dropdown(
            choices=["eng", "hak", "deu"],
            value="eng",
            label="選擇語言 (Language)",
        ),
        # Text to synthesize.
        gr.Textbox(label="輸入文字 (Text)"),
    ],
    # Synthesized speech output.
    outputs=gr.Audio(type="numpy", label="語音輸出"),
    allow_flagging="never",  # hide the Flag button
)
iface.launch()