|
import torch
import scipy.io.wavfile
import os
import streamlit as st
import pandas as pd
from transformers import pipeline, VitsTokenizer, VitsModel
from datasets import load_dataset, Audio
from huggingface_hub.inference_api import InferenceApi

from src import *
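# The synthesis helpers (synth_mms, synth_coqui, synth_espeakng, synth_toucan,
# synth_piper, synth_africanvoices, convert_coqui) and the language metadata
# (language_list, models, placeholders, decode_iso) come from the src package.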
|
|
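# Page header: title and subtitle on the left, logo on the right.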
|
col1, col2 = st.columns([20, 3])
with col2:
    st.image('logo.png', use_container_width=True)
with col1:
    st.title("Mockingbird")
    st.header("A demo of open Text to Speech tools")

tts, about = st.tabs(["Text to speech", "**About**"])
|
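# Text to speech tab: language picker, input text, and an optional
# target-speaker recording for voice conversion.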
|
with tts:

    tts_lang = st.selectbox('Language of text', language_list, format_func=decode_iso)
    tts_text = st.text_area(label="Please enter your sentence here:",
                            value="", placeholder=placeholders[tts_lang])

    target_speaker_file = st.file_uploader("If you would like to test voice conversion, upload a single .wav file below. Otherwise, a default target speaker file will be used.",
                                           type=['wav'])
|
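    # Everything below runs only after the user clicks Generate; an empty
    # text box falls back to the language's placeholder sentence.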
|
    if st.button("Generate"):

        if tts_lang in ['rus', 'fas']:
            st.warning("WARNING! On Windows, ESpeak-NG has trouble synthesizing output when the input is in a non-Latin alphabet.")

        st.divider()

        with st.spinner(":rainbow[Synthesizing, please wait... (this will be slowest the first time you generate audio in a new language)]"):
            if tts_text == "":
                tts_text = placeholders[tts_lang]
|
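            # Out-of-the-box synthesis, one call per engine. Each synth_*
            # helper is expected to return (waveform, sample_rate), or None
            # when the engine cannot handle the selected language.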
|
            base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
            base_coqui = synth_coqui(tts_text, models[tts_lang]['coqui'])
            base_espeakng = synth_espeakng(tts_text, models[tts_lang]['espeakng'])
            try:
                base_toucan = synth_toucan(tts_text, models[tts_lang]['toucan'])
            except Exception:
                base_toucan = None
            base_piper = synth_piper(tts_text, models[tts_lang]['piper'])
|
|
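            # Language-specific fine-tuned MMS variants, loaded straight from
            # the Hugging Face Hub by checkpoint name.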
|
            if tts_lang == "swh":
                finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
                finetuned_mms2 = synth_mms(tts_text, "khof312/mms-tts-swh-female-2")
            if tts_lang == "spa":
                finetuned_mms1 = synth_mms(tts_text, "ylacombe/mms-spa-finetuned-argentinian-monospeaker")
                finetuned_mms2 = synth_mms(tts_text, "ylacombe/mms-spa-finetuned-chilean-monospeaker")
                finetuned_mms3 = synth_mms(tts_text, "ylacombe/mms-spa-finetuned-colombian-monospeaker")
                finetuned_mms4 = synth_mms(tts_text, "khof312/mms-tts-spa-female")
            if tts_lang == "lin":
                finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-lin-female")
                try:
                    finetuned_africanvoices = synth_africanvoices(tts_text, models[tts_lang]['africanvoices'])
                except Exception:
                    finetuned_africanvoices = None
|
|
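        # Render results as rows of three columns: model | configuration | audio.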
|
"## Synthesis" |
|
"### Default models" |
|
row1 = st.columns([1,1,2]) |
|
row2 = st.columns([1,1,2]) |
|
row3 = st.columns([1,1,2]) |
|
row4 = st.columns([1,1,2]) |
|
row5 = st.columns([1,1,2]) |
|
row6 = st.columns([1,1,2]) |
|
|
|
row1[0].write("**Model**") |
|
row1[1].write("**Configuration**") |
|
row1[2].write("**Audio**") |
|
|
|
        if base_mms is not None:
            row2[0].write("[Meta MMS](https://huggingface.co/docs/transformers/main/en/model_doc/mms)")
            row2[1].write("default")
            row2[2].audio(base_mms[0], sample_rate=base_mms[1])

        if base_coqui is not None:
            row3[0].write("[Coqui](https://docs.coqui.ai/en/latest/index.html)")
            row3[1].write("default")
            row3[2].audio(base_coqui[0], sample_rate=base_coqui[1])

        if base_espeakng is not None:
            row4[0].write("[Espeak-ng](https://github.com/espeak-ng/espeak-ng)")
            row4[1].write("default")
            row4[2].audio(base_espeakng[0], sample_rate=base_espeakng[1])

        if base_toucan is not None:
            row5[0].write("[IMS-Toucan](https://github.com/DigitalPhonetics/IMS-Toucan)")
            row5[1].write("default")
            row5[2].audio(base_toucan[0], sample_rate=base_toucan[1])

        if base_piper is not None:
            row6[0].write("[Piper](https://github.com/rhasspy/piper)")
            row6[1].write("default")
            row6[2].audio(base_piper[0], sample_rate=base_piper[1])
|
        if tts_lang == "swh":
            "### Fine-tuned models"
            row1 = st.columns([1, 1, 2])
            row2 = st.columns([1, 1, 2])
            row3 = st.columns([1, 1, 2])

            row1[0].write("**Model**")
            row1[1].write("**Configuration**")
            row1[2].write("**Audio**")

            row2[0].write("Meta MMS")
            row2[1].write("[female 1](https://huggingface.co/khof312/mms-tts-swh-female-1)")
            row2[2].audio(finetuned_mms1[0], sample_rate=finetuned_mms1[1])
            row3[0].write("Meta MMS")
            row3[1].write("[female 2](https://huggingface.co/khof312/mms-tts-swh-female-2)")
            row3[2].audio(finetuned_mms2[0], sample_rate=finetuned_mms2[1])
|
        if tts_lang == "spa":
            "### Fine-tuned models"
            row1 = st.columns([1, 1, 2])
            row2 = st.columns([1, 1, 2])
            row3 = st.columns([1, 1, 2])
            row4 = st.columns([1, 1, 2])
            row5 = st.columns([1, 1, 2])

            row1[0].write("**Model**")
            row1[1].write("**Configuration**")
            row1[2].write("**Audio**")

            row2[0].write("Meta MMS")
            row2[1].write("[ylacombe - Argentinian](https://huggingface.co/ylacombe/mms-spa-finetuned-argentinian-monospeaker)")
            row2[2].audio(finetuned_mms1[0], sample_rate=finetuned_mms1[1])
            row3[0].write("Meta MMS")
            row3[1].write("[ylacombe - Chilean](https://huggingface.co/ylacombe/mms-spa-finetuned-chilean-monospeaker)")
            row3[2].audio(finetuned_mms2[0], sample_rate=finetuned_mms2[1])
            row4[0].write("Meta MMS")
            row4[1].write("[ylacombe - Colombian](https://huggingface.co/ylacombe/mms-spa-finetuned-colombian-monospeaker)")
            row4[2].audio(finetuned_mms3[0], sample_rate=finetuned_mms3[1])
            row5[0].write("Meta MMS")
            row5[1].write("[khof312 - female](https://huggingface.co/khof312/mms-tts-spa-female)")
            row5[2].audio(finetuned_mms4[0], sample_rate=finetuned_mms4[1])
|
        if tts_lang == "lin":
            "### Fine-tuned models"
            row1 = st.columns([1, 1, 2])
            row2 = st.columns([1, 1, 2])
            row3 = st.columns([1, 1, 2])

            row1[0].write("**Model**")
            row1[1].write("**Configuration**")
            row1[2].write("**Audio**")

            row2[0].write("Meta MMS")
            row2[1].write("[khof312 - female](https://huggingface.co/khof312/mms-tts-lin-female)")
            row2[2].audio(finetuned_mms1[0], sample_rate=finetuned_mms1[1])

            if finetuned_africanvoices is not None:
                row3[0].write("African Voices")
                row3[1].write("[African Voices]()")
                row3[2].audio(finetuned_africanvoices[0], sample_rate=finetuned_africanvoices[1])
|
        st.divider()

        "## Voice conversion"

        st.warning('''Note: The naturalness of the audio will only be as good as that of the audio in "default models" above.''')
|
|
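        # Voice conversion: each engine's output is written to disk as the
        # source audio, then re-synthesized in the target speaker's voice via Coqui.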
|
        if target_speaker_file is not None:
            rate, wav = scipy.io.wavfile.read(target_speaker_file)
            scipy.io.wavfile.write("target_speaker_custom.wav", rate=rate, data=wav)
            target_speaker = "target_speaker_custom.wav"
        else:
            target_speaker = "target_speaker.wav"

        if base_mms is not None:
            scipy.io.wavfile.write("source_speaker_mms.wav", rate=base_mms[1], data=base_mms[0].T)
            converted_mms = convert_coqui('source_speaker_mms.wav', target_speaker)

        if base_coqui is not None:
            scipy.io.wavfile.write("source_speaker_coqui.wav", rate=base_coqui[1], data=base_coqui[0].T)
            converted_coqui = convert_coqui('source_speaker_coqui.wav', target_speaker)

        if base_espeakng is not None:
            scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
            converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)

        if base_toucan is not None:
            scipy.io.wavfile.write("source_speaker_toucan.wav", rate=base_toucan[1], data=base_toucan[0].T)
            converted_toucan = convert_coqui('source_speaker_toucan.wav', target_speaker)
|
        row1 = st.columns([1, 1, 2])
        row2 = st.columns([1, 1, 2])
        row3 = st.columns([1, 1, 2])
        row4 = st.columns([1, 1, 2])
        row5 = st.columns([1, 1, 2])

        row1[0].write("**Model**")
        row1[1].write("**Configuration**")
        row1[2].write("**Audio**")

        if base_mms is not None:
            row2[0].write("Meta MMS")
            row2[1].write("converted")
            row2[2].audio(converted_mms[0], sample_rate=converted_mms[1])

        if base_coqui is not None:
            row3[0].write("Coqui")
            row3[1].write("converted")
            row3[2].audio(converted_coqui[0], sample_rate=converted_coqui[1])

        if base_espeakng is not None:
            row4[0].write("Espeak-ng")
            row4[1].write("converted")
            row4[2].audio(converted_espeakng[0], sample_rate=converted_espeakng[1])

        if base_toucan is not None:
            row5[0].write("IMS Toucan")
            row5[1].write("converted")
            row5[2].audio(converted_toucan[0], sample_rate=converted_toucan[1])
|
|
with about:

    st.markdown('''# Mockingbird TTS Demo
This page is a demo of openly available Text to Speech models for various languages of interest. Currently, 3 synthesizers with multilingual offerings are supported out of the box:
- [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1000 languages.[^1]
- [**IMS Toucan**](https://github.com/DigitalPhonetics/IMS-Toucan), which supports 7000 languages.[^4]
- [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices.[^3]

On a case-by-case basis, for different languages of interest, I have added:
- [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting, and these models are still available. Languages must be added on a model-by-model basis.
- Specific fine-tuned variants of Meta's MMS (either fine-tuned by [Yoach Lacombe](https://huggingface.co/ylacombe), or fine-tuned by me using his scripts).

I am in the process of adding support for:
- [**Piper**](https://github.com/rhasspy/piper), a TTS system that supports multiple voices per language and approximately 30 languages. To test different voices, please see the [Huggingface demo](https://huggingface.co/spaces/k2-fsa/text-to-speech).[^5]
- [**African Voices**](https://github.com/neulab/AfricanVoices), a CMU research project that fine-tuned synthesizers for different African languages. The site hosting the synthesizers is deprecated, but they can be downloaded from the Internet Archive's Wayback Machine.[^6]

Voice conversion is currently achieved through Coqui.

Notes:
1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
2. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
3. Not all synthesizers support a given language.

[^1]: Endpoints used are of the form https://huggingface.co/facebook/mms-tts-[LANG].
    Learn more:
    [Docs](https://huggingface.co/docs/transformers/model_doc/mms) |
    [Paper](https://arxiv.org/abs/2305.13516) |
    [Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)
[^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
[^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
[^4]: The language list is available in the Gradio API documentation [here](https://huggingface.co/spaces/Flux9665/MassivelyMultilingualTTS).
[^5]: The list of available voices is [here](https://github.com/rhasspy/piper/blob/master/VOICES.md), model checkpoints are [here](https://huggingface.co/datasets/rhasspy/piper-checkpoints/tree/main), and they can be tested [here](https://rhasspy.github.io/piper-samples/).
[^6]:
''')
|