# mms-tts-demo / app.py
# Duplicate from Matthijs/mms-tts-demo (revision 1c3a7ff, 5.02 kB; page shows user karim23657)
import gradio as gr
import numpy as np
import torch
import os
import re
import tempfile
from transformers import VitsModel, VitsTokenizer
# One VitsModel per selectable language, loaded when the module is imported.
models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in (
        ("English", "Matthijs/mms-tts-eng"),
        ("German", "Matthijs/mms-tts-deu"),
        ("Korean", "Matthijs/mms-tts-kor"),
    )
}
# One VitsTokenizer per selectable language, loaded when the module is imported.
tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in (
        ("English", "Matthijs/mms-tts-eng"),
        ("German", "Matthijs/mms-tts-deu"),
        ("Korean", "Matthijs/mms-tts-kor"),
    )
}
# For certain checkpoints, the text needs to be romanized before tokenization.
# MMS-TTS uses the uroman.pl script from https://github.com/isi-nlp/uroman for this.
# That repository must be available in the folder "uroman" (the script is run as uroman/bin/uroman.pl).
def uromanize(text, uroman_pl):
    """Romanize ``text`` by piping it through the uroman Perl script.

    The text is sent to the script on stdin and the result is read from its
    stdout, both as UTF-8, so no temporary files or shell are involved.

    Args:
        text: The string to romanize.
        uroman_pl: Path to the ``uroman.pl`` script; it is run with ``perl``.

    Returns:
        The script's output with every run of whitespace (line breaks
        included) collapsed to a single space and both ends stripped. An
        empty output yields an empty string.

    Raises:
        FileNotFoundError: If the ``perl`` executable cannot be found.
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    # Local import keeps this function self-contained; the module's import
    # block does not provide subprocess.
    import subprocess

    iso = "xxx"  # value handed to the script's -l option
    # An argument list (no shell) means neither the script path nor the text
    # is ever interpreted by a shell. stderr is left attached to the parent
    # process so the script's diagnostics remain visible.
    result = subprocess.run(
        ["perl", uroman_pl, "-l", iso],
        input=text,
        stdout=subprocess.PIPE,
        encoding="utf-8",
        check=True,
    )
    # Normalizing the whole output (rather than only its first line) keeps
    # all of a multi-line input instead of silently dropping the remainder.
    return re.sub(r"\s+", " ", result.stdout).strip()
def predict(text, language=None):
    """Synthesize speech for ``text`` using the model chosen by ``language``.

    Args:
        text: The text to speak.
        language: Key into the module-level ``tokenizers`` and ``models``
            dicts. A value that is not a key there (including the default
            ``None``) raises ``KeyError`` once the text is non-blank.

    Returns:
        A pair ``((sample_rate, samples), processed_text)``. ``sample_rate``
        is 16000 and ``samples`` is an int16 numpy array. ``processed_text``
        is the romanized text for Korean, and the text decoded back from the
        token ids for every other language. Blank input yields an empty
        sample array and an empty string.
    """
    if not text.strip():
        # Return one value per declared output (audio, processed text), with
        # the same shape as the normal return below. Returning only the audio
        # tuple would be unpacked as two outputs by the caller.
        return (16000, np.zeros(0, dtype=np.int16)), ""

    if language == "Korean":
        # The Korean checkpoint is fed romanized text.
        text = uromanize(text, os.path.join("uroman", "bin", "uroman.pl"))

    tokenizer = tokenizers[language]
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

    if language != "Korean":
        # Report the text as reconstructed from the token ids.
        text = tokenizer.batch_decode(input_ids)[0]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        speech = models[language](input_ids).audio[0]

    # Scale the waveform to the int16 range
    # (assumes the model output lies in [-1, 1] -- TODO confirm).
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech), text
# Page heading passed to gr.Interface as `title`.
title = "MMS-TTS speech synthesis"
# Introductory text passed to gr.Interface as `description`; it uses Markdown
# link syntax for the paper and checkpoint-list links.
description = """
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
over 1000 text-to-speech (TTS) models.
This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
model, this code can also be used to run VITS checkpoints.
For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).
As the model performs random sampling, the generated speech is slightly different each time.
The voice may also vary between runs, or sometimes even in the same sentence.
(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
are not conditioned on a speaker ID.)
"""
# HTML block with reference links and the BibTeX citation for the MMS paper,
# passed to gr.Interface as `article`.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
</p>
<pre>
@article{pratap2023mms,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
journal={arXiv},
year={2023}
}
</pre>
</div>
"""
# Sample inputs passed to gr.Interface as `examples`. Each entry is
# [input text, language], in the same order as the interface's two inputs;
# every language value is one of the Radio choices.
examples = [
["It is not in the stars to hold our destiny but in ourselves.", "English"],
["The octopus and Oliver went to the opera in October.", "English"],
["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "English"],
["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "English"],
["A synonym for cinnamon is a cinnamon synonym.", "English"],
["How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "English"],
["Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.", "German"],
["Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.", "German"],
["안녕 세상, 날씨는 아름다워", "Korean"], # Hello world, the weather is beautiful (Google Translate)
]
# Input widgets, in the order predict() receives them: the text, then the language.
text_input = gr.Text(label="Input Text")
language_input = gr.Radio(
    label="Language",
    choices=["English", "German", "Korean"],
    value="English",
)

# Output widgets, in the order predict() returns them: the audio, then the processed text.
audio_output = gr.Audio(label="Generated Speech", type="numpy")
text_output = gr.Text(label="Processed text")

# Assemble the demo page around predict() and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[text_input, language_input],
    outputs=[audio_output, text_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()