# mms-tts-demo / app.py
# Matthijs Hollemans
# here we go!
# f0839e8
# raw
# history blame contribute delete
# No virus
# 5.02 kB
import os
import re
import subprocess
import tempfile

import gradio as gr
import numpy as np
import torch
from transformers import VitsModel, VitsTokenizer
# Checkpoint identifier for each language offered in the demo.
_CHECKPOINTS = {
    "English": "Matthijs/mms-tts-eng",
    "German": "Matthijs/mms-tts-deu",
    "Korean": "Matthijs/mms-tts-kor",
}

# Everything is loaded once at import time: all models first, then all
# tokenizers, each dict keyed by the language name.
models = {
    language: VitsModel.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}

tokenizers = {
    language: VitsTokenizer.from_pretrained(checkpoint)
    for language, checkpoint in _CHECKPOINTS.items()
}
# For certain checkpoints, the text needs to be romanized before tokenization.
# MMS-TTS uses the uroman.pl script from https://github.com/isi-nlp/uroman for this.
# uroman must be installed in the folder "uroman" (the script is run from uroman/bin/uroman.pl).
def uromanize(text, uroman_pl):
    """Romanize ``text`` by piping it through the uroman Perl script.

    The script is run as ``perl <uroman_pl> -l xxx`` with ``text`` on its
    standard input, and its standard output is read back. Both streams are
    UTF-8 so that non-Latin input does not depend on the system locale.

    Args:
        text: The text to romanize.
        uroman_pl: Path to the ``uroman.pl`` script.

    Returns:
        The first line the script wrote, with every run of whitespace
        collapsed to a single space and leading/trailing whitespace removed.
        An empty string if the script wrote nothing.

    Raises:
        RuntimeError: If ``perl`` cannot be started or the script exits with
            a non-zero status.
    """
    iso = "xxx"  # value handed to the script's -l option
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters from being misparsed.
    command = ["perl", uroman_pl, "-l", iso]

    try:
        completed = subprocess.run(
            command,
            input=text,
            capture_output=True,
            encoding="utf-8",
            check=True,
        )
    except OSError as err:
        raise RuntimeError(f"could not run perl for uroman: {err}") from err
    except subprocess.CalledProcessError as err:
        details = (err.stderr or "").strip()
        raise RuntimeError(
            f"uroman exited with status {err.returncode}: {details}"
        ) from err

    # Only the first output line is used, as before.
    first_line = completed.stdout.split("\n", 1)[0]
    return re.sub(r"\s+", " ", first_line).strip()
def predict(text, language=None):
    """Synthesize speech for ``text`` with the checkpoint chosen by ``language``.

    Args:
        text: The text to speak.
        language: Key into the module-level ``tokenizers`` and ``models``
            dicts. For "Korean" the text is first romanized with
            ``uromanize``.

    Returns:
        A pair ``((sampling_rate, samples), processed_text)``:
        ``sampling_rate`` is 16000, ``samples`` is an int16 NumPy array, and
        ``processed_text`` is the romanized text for Korean or the text
        decoded back from the token ids otherwise. Empty or whitespace-only
        input yields an empty sample array and an empty string.

    Raises:
        KeyError: If ``language`` is not a key of ``tokenizers``/``models``
            (only reached for non-empty text).
    """
    if not text or not text.strip():
        # Keep the same two-element shape as the normal path; the interface
        # declares two outputs (audio and text).
        return (16000, np.zeros(0, dtype=np.int16)), ""

    is_korean = language == "Korean"
    if is_korean:
        uroman_pl = os.path.join("uroman", "bin", "uroman.pl")
        text = uromanize(text, uroman_pl)

    tokenizer = tokenizers[language]
    inputs = tokenizer(text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    if not is_korean:
        # Show the text as the tokenizer actually saw it.
        text = tokenizer.batch_decode(input_ids)[0]

    model = models[language]
    with torch.no_grad():
        outputs = model(input_ids)

    speech = outputs.audio[0]
    # Scale the waveform to the int16 range for the audio output.
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (16000, speech), text
# Static text for the demo page. Each constant is passed to gr.Interface under
# the keyword argument of the same name (title=, description=, article=).

# Page heading.
title = "MMS-TTS speech synthesis"
# Markdown-formatted introduction to the MMS-TTS project and this demo.
description = """
Facebook's [Massively Multilingual Speech](https://arxiv.org/abs/2305.13516) project aims to provide
speech technology across a diverse range of languages. The MMS-TTS project contains a collection of
over 1000 text-to-speech (TTS) models.
This demo shows how to use MMS-TTS using 🤗 Transformers. Since MMS-TTS is based on the VITS
model, this code can also be used to run VITS checkpoints.
For a full list of checkpoints, [click here](https://huggingface.co/models?filter=vits).
As the model performs random sampling, the generated speech is slightly different each time.
The voice may also vary between runs, or sometimes even in the same sentence.
(Note that 🤗 Transformers also supports multispeaker VITS checkpoints but the MMS-TTS checkpoints
are not conditioned on a speaker ID.)
"""
# HTML block with reference links and the BibTeX citation for the MMS paper.
article = """
<div style='margin:20px auto;'>
<p>References: <a href="https://arxiv.org/abs/2305.13516">MMS paper</a> |
<a href="https://ai.facebook.com/blog/multilingual-model-speech-recognition/">blog post</a> |
<a href="https://huggingface.co/facebook/mms-tts">original weights</a> |
<a href="https://huggingface.co/spaces/mms-meta/MMS">original MMS space</a>
</p>
<pre>
@article{pratap2023mms,
title={Scaling Speech Technology to 1,000+ Languages},
author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli},
journal={arXiv},
year={2023}
}
</pre>
</div>
"""
# Example inputs passed to gr.Interface via examples=. Each entry is
# [input text, language], matching the order of predict's parameters; the
# language must be one of the keys of the models/tokenizers dicts.
examples = [
["It is not in the stars to hold our destiny but in ourselves.", "English"],
["The octopus and Oliver went to the opera in October.", "English"],
["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "English"],
["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "English"],
["A synonym for cinnamon is a cinnamon synonym.", "English"],
["How much wood would a woodchuck chuck if a woodchuck could chuck wood?", "English"],
["Eins, zwei, Polizei. Drei, vier, Grenadier. Fünf, sechs, alte Keks. Sieben, acht, gute Nacht.", "German"],
["Alle meine Entchen, schwimmen auf dem See. Köpfchen in das Wasser, Schwänzchen in die Höh.", "German"],
["안녕 세상, 날씨는 아름다워", "Korean"], # Hello world, the weather is beautiful (Google Translate)
]
# Input widgets, in the order of predict's parameters (text, language).
text_input = gr.Text(label="Input Text")
language_input = gr.Radio(
    label="Language",
    choices=["English", "German", "Korean"],
    value="English",
)

# Output widgets, in the order of predict's return values (audio, text).
speech_output = gr.Audio(label="Generated Speech", type="numpy")
processed_text_output = gr.Text(label="Processed text")

# Assemble the demo page around predict and start serving it.
demo = gr.Interface(
    fn=predict,
    inputs=[text_input, language_input],
    outputs=[speech_output, processed_text_output],
    title=title,
    description=description,
    article=article,
    examples=examples,
)
demo.launch()