Load initial demo.
Files changed:
- .gitignore +167 -0
- README.md +14 -11
- app.py +194 -0
- requirements.txt +7 -0
- src/__init__.py +4 -0
- src/convert.py +21 -0
- src/helpers.py +29 -0
- src/lookups.py +72 -0
- src/synthesize.py +85 -0
- target_speaker.wav +0 -0
.gitignore
ADDED
@@ -0,0 +1,167 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+
+
+# MY FILES
+dev_roadmap.txt
README.md
CHANGED
@@ -1,12 +1,15 @@
-
-
-emoji: 🐠
-colorFrom: blue
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.37.0
-app_file: app.py
-pinned: false
----
+# Mockingbird TTS Demo
+This repo hosts Mockingbird, a demo of open Text-to-Speech tools.
 
-
+Currently, 3 synthesizers are supported:
+- [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model
+- [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package and the models supplied through it
+- [**ESpeak-NG's**](https://github.com/espeak-ng/espeak-ng) synthetic voices
+
+Voice conversion is achieved through Coqui.
+
+Notes:
+1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
+2. Coqui is no longer being officially developed.
+3. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
+4. Not all synthesizers support a given language.
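Before the application code below, a minimal sketch of the pipeline the README describes: synthesize speech with one engine, then re-voice it through Coqui's zero-shot conversion. This example is not part of the commit; it assumes the `src` package and `target_speaker.wav` added in this changeset, and the file names are placeholders.

```python
import numpy as np
import scipy.io.wavfile as wavfile
from src import synth_mms, convert_coqui, models

# 1) Synthesize Swahili speech with the MMS checkpoint from the lookup table.
audio, rate = synth_mms("Mfuko wa Kimataifa wa Watoto", models["swh"]["mms"])
wavfile.write("source_speaker.wav", rate=rate, data=audio.T)

# 2) Re-voice it so it sounds like the bundled target speaker (zero-shot conversion).
converted, conv_rate = convert_coqui("source_speaker.wav", "target_speaker.wav")
wavfile.write("converted.wav", rate=conv_rate, data=np.asarray(converted))
```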
app.py
ADDED
@@ -0,0 +1,194 @@
+import torch
+import scipy
+import os
+import streamlit as st
+import pandas as pd
+from transformers import pipeline  # set_seed,
+from transformers import VitsTokenizer, VitsModel
+from datasets import load_dataset, Audio
+from huggingface_hub.inference_api import InferenceApi
+
+from src import *
+
+
+########################
+
+st.title("Mockingbird")
+st.header("A demo of open Text to Speech tools")
+tts, about = st.tabs(["Text to speech", "**About**"])
+
+########################
+with tts:
+
+    # Configuration -- language choice and text
+    tts_lang = st.selectbox('Language of text', (language_list), format_func=decode_iso)
+    tts_text = st.text_area(label="Please enter your sentence here:",
+                            value="", placeholder=placeholders[tts_lang])
+
+    target_speaker_file = st.file_uploader("If you would like to test voice conversion, you may upload your audio below. You should upload one file in .wav format. If you don't, a default file will be used.",
+                                           type=['wav'])
+
+    # Inference
+    if st.button("Generate"):
+
+        # Warning about alphabet support
+        if tts_lang in ['rus', 'fas']:
+            st.warning("WARNING! On Windows, ESpeak-NG has trouble synthesizing output when input is provided in non-Latin alphabets.")
+
+        st.divider()
+
+        # Synthesis
+        with st.spinner(":rainbow[Synthesizing, please wait... (this will be slowest the first time you generate audio in a new language)]"):
+            if tts_text == "":
+                tts_text = placeholders[tts_lang]
+
+            # First, make the audio
+            base_mms = synth_mms(tts_text, models[tts_lang]['mms'])
+            base_coqui = synth_coqui(tts_text, models[tts_lang]['coqui'])
+            base_espeakng = synth_espeakng(tts_text, models[tts_lang]['espeakng'])
+
+            if tts_lang == "swh":
+                finetuned_mms1 = synth_mms(tts_text, "khof312/mms-tts-swh-female-1")
+                finetuned_mms2 = synth_mms(tts_text, "khof312/mms-tts-swh-female-2")
+
+        # vc_mms
+        # vc_coqui
+        # vc_espeakng
+        "## Synthesis"
+        "### Default models"
+        row1 = st.columns([1, 1, 2])
+        row2 = st.columns([1, 1, 2])
+        row3 = st.columns([1, 1, 2])
+        row4 = st.columns([1, 1, 2])
+
+        row1[0].write("**Model**")
+        row1[1].write("**Configuration**")
+        row1[2].write("**Audio**")
+
+        if base_mms is not None:
+            row2[0].write("Meta MMS")
+            row2[1].write("default")
+            row2[2].audio(base_mms[0], sample_rate=base_mms[1])
+
+        if base_coqui is not None:
+            row3[0].write("Coqui")
+            row3[1].write("default")
+            row3[2].audio(base_coqui[0], sample_rate=base_coqui[1])
+
+        if base_espeakng is not None:
+            row4[0].write("Espeak-ng")
+            row4[1].write("default")
+            row4[2].audio(base_espeakng[0], sample_rate=base_espeakng[1])
+
+        #################################################################
+        if tts_lang == "swh":
+            "### Fine Tuned"
+            row1 = st.columns([1, 1, 2])
+            row2 = st.columns([1, 1, 2])
+            row3 = st.columns([1, 1, 2])
+
+            row1[0].write("**Model**")
+            row1[1].write("**Configuration**")
+            row1[2].write("**Audio**")
+
+            row2[0].write("Meta MMS")
+            row2[1].write("female 1")
+            row2[2].audio(finetuned_mms1[0], sample_rate=finetuned_mms1[1])
+            row3[0].write("Meta MMS")
+            row3[1].write("female 2")
+            row3[2].audio(finetuned_mms2[0], sample_rate=finetuned_mms2[1])
+
+        st.divider()
+
+        "## Voice conversion"  #################################################################
+
+        st.warning('''Note: The naturalness of the audio will only be as good as that of the audio in "default models" above.''')
+
+        if target_speaker_file is not None:
+            rate, wav = scipy.io.wavfile.read(target_speaker_file)
+            scipy.io.wavfile.write("target_speaker_custom.wav", data=wav, rate=rate)
+            target_speaker = "target_speaker_custom.wav"
+        else:
+            target_speaker = "target_speaker.wav"
+
+        if base_mms is not None:
+            scipy.io.wavfile.write("source_speaker_mms.wav", rate=base_mms[1], data=base_mms[0].T)
+            converted_mms = convert_coqui('source_speaker_mms.wav', target_speaker)
+
+        if base_coqui is not None:
+            scipy.io.wavfile.write("source_speaker_coqui.wav", rate=base_coqui[1], data=base_coqui[0].T)
+            converted_coqui = convert_coqui('source_speaker_coqui.wav', target_speaker)
+
+        if base_espeakng is not None:
+            scipy.io.wavfile.write("source_speaker_espeakng.wav", rate=base_espeakng[1], data=base_espeakng[0].T)
+            converted_espeakng = convert_coqui('source_speaker_espeakng.wav', target_speaker)
+
+        row1 = st.columns([1, 1, 2])
+        row2 = st.columns([1, 1, 2])
+        row3 = st.columns([1, 1, 2])
+
+        row1[0].write("**Model**")
+        row1[1].write("**Configuration**")
+        row1[2].write("**Audio**")
+
+        if base_mms is not None:
+            row1[0].write("Meta MMS")
+            row1[1].write("converted")
+            row1[2].audio(converted_mms[0], sample_rate=converted_mms[1])
+
+        if base_coqui is not None:
+            row2[0].write("Coqui")
+            row2[1].write("converted")
+            row2[2].audio(converted_coqui[0], sample_rate=converted_coqui[1])
+
+        if base_espeakng is not None:
+            row3[0].write("Espeak-ng")
+            row3[1].write("converted")
+            row3[2].audio(converted_espeakng[0], sample_rate=converted_espeakng[1])
+
+        # row3[0].write("MMS-TTS-SWH")
+        # row3[1].audio(synth, sample_rate=16_000)
+        # row3[2].audio(synth, sample_rate=16_000)
+
+        # st.audio(synth, sample_rate=16_000)
+        # data.write(np.random.randn(10, 1)
+
+        # col1.subheader("A wide column with a chart")
+        # col1.line_chart(data)
+
+        # col2.subheader("A narrow column with the data")
+        # col2.write(data)
+
+with about:
+    # st.header("How it works")
+    st.markdown('''# Mockingbird TTS Demo
+This page is a demo of the openly available Text to Speech models for various languages of interest. Currently, 3 synthesizers are supported:
+- [**Meta's Massively Multilingual Speech (MMS)**](https://ai.meta.com/blog/multilingual-model-speech-recognition/) model, which supports over 1,000 languages.[^1]
+- [**Coqui's TTS**](https://docs.coqui.ai/en/latest/#) package;[^2] while no longer supported, Coqui acted as a hub for TTS model hosting and these models are still available.
+- [**ESpeak-NG**](https://github.com/espeak-ng/espeak-ng/tree/master)'s synthetic voices[^3]
+
+Voice conversion is achieved through Coqui.
+
+Notes:
+1. ESpeak-NG seems to have the worst performance out of the box, but it has a lot of options for controlling voice output.
+2. Where a synthesizer supports multiple models/voices, I manually pick the appropriate model.
+3. Not all synthesizers support a given language.
+
+
+[^1]: Endpoints used are of the form https://huggingface.co/facebook/mms-tts-[LANG].
+      Learn more:
+      [Docs](https://huggingface.co/docs/transformers/model_doc/mms) |
+      [Paper](https://arxiv.org/abs/2305.13516) |
+      [Supported languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html)
+
+[^2]: [Available models](https://github.com/coqui-ai/TTS/blob/dev/TTS/.models.json)
+[^3]: [Language list](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md)
+''')
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+datasets
+librosa
+pycountry
+scipy
+sentencepiece
+transformers
+torch
src/__init__.py
ADDED
@@ -0,0 +1,4 @@
+from .helpers import *
+from .lookups import *
+from .synthesize import *
+from .convert import *
src/convert.py
ADDED
@@ -0,0 +1,21 @@
+import torch
+import IPython
+from TTS.api import TTS
+
+def convert_coqui(source_wav: str, target_wav: str):
+    '''
+    Use Coqui TTS for zero-shot voice conversion.
+
+    Inputs:
+        source_wav: Wav of the thing you want to say.
+        target_wav: Wav of the speaker you want to hear.
+    Returns:
+        Streaming wav and sampling rate.
+    '''
+    # Get device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False).to(device)
+    wav = tts.voice_conversion(source_wav=source_wav, target_wav=target_wav)
+
+    return wav, 24000  # Identified sampling rate of freevc24
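`convert_coqui` can also be used on its own for zero-shot conversion between any two recordings; per the return value above, the output rate is fixed at 24 kHz by the freevc24 model regardless of the inputs. A hypothetical example, not part of the commit (file names are placeholders):

```python
import numpy as np
from scipy.io import wavfile
from src.convert import convert_coqui

# Make "my_recording.wav" sound like the speaker in "reference_voice.wav".
converted, rate = convert_coqui("my_recording.wav", "reference_voice.wav")
print(rate)  # 24000
wavfile.write("my_recording_as_reference.wav", rate=rate, data=np.asarray(converted))
```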
src/helpers.py
ADDED
@@ -0,0 +1,29 @@
+import pycountry
+
+iso_encoder = {"English": "eng",
+               "French": "fra",
+               "Moore": "mos"}
+
+iso_decoder = dict((v, k) for k, v in iso_encoder.items())
+
+
+def encode_iso(lang: str) -> str:
+    ''' Takes the name of a language and returns its ISO-3 code. '''
+    return iso_encoder[lang]
+
+def decode_iso(iso: str) -> str:
+    ''' Takes an ISO-3 code and returns the name of the language. '''
+
+    if "-" in iso:
+        iso, suffix = iso.split("-", 1)
+    else:
+        suffix = None
+
+    name = pycountry.languages.get(alpha_3=iso).name
+    name = name.replace("Mossi", "Mooré").replace("Swahili (individual language)", "Swahili")
+
+    if suffix is not None:
+        name += f" - {suffix}"
+
+    return name
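A small illustration of the helpers (hypothetical calls, not in the commit): pycountry resolves the ISO 639-3 code, the replace() calls normalize the display names, and anything after a hyphen is passed through as a suffix.

```python
from src.helpers import decode_iso, encode_iso

print(decode_iso("mos"))               # "Mooré"  (pycountry's "Mossi", renamed)
print(decode_iso("swh"))               # "Swahili" (drops "(individual language)")
print(decode_iso("urd-script_latin"))  # "Urdu - script_latin" (suffix preserved)
print(encode_iso("French"))            # "fra"
```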
src/lookups.py
ADDED
@@ -0,0 +1,72 @@
+language_list = ['swh', 'eng', 'spa', 'fra', 'por', 'ron', 'fas', 'lin', 'mos', 'rus',
+                 #'ara','fas','ukr','tur', 'mya', 'rus',
+                 #'kmr-script_latin', 'urd-script_arabic', 'urd-script_devanagari', 'urd-script_latin',
+                 ]
+
+#####################################
+placeholders = {
+    'swh': "Mfuko wa Kimataifa wa Watoto",
+    'eng': "the United Nations International Children's Emergency Fund",
+    'spa': "El Fondo de las Naciones Unidas para la Infancia",
+    'fra': "Le Fonds des Nations unies pour l'enfance",
+    'por': "O Fundo das Nações Unidas para a Infância",
+    'ron': "Fondul Internațional pentru Urgențe ale Copiilor al Națiunilor Unite",
+    'fas': "صندوق کودکان ملل متحد",
+    'lin': 'Your phrase here',
+    'mos': 'Your phrase here',
+    'rus': 'Международного фонда помощи детям'
+}
+
+#####################################
+models = {
+    'swh': {
+        'mms': 'facebook/mms-tts-swh',
+        'coqui': None,
+        'espeakng': 'sw',
+    },
+    'eng': {
+        'mms': 'facebook/mms-tts-eng',
+        'coqui': None,
+        'espeakng': 'en',
+    },
+    'spa': {
+        'mms': 'facebook/mms-tts-spa',
+        'coqui': 'tts_models/es/css10/vits',
+        'espeakng': 'es-419',
+    },
+    'fra': {
+        'mms': 'facebook/mms-tts-fra',
+        'coqui': 'tts_models/fr/css10/vits',
+        'espeakng': 'fr',
+    },
+    'por': {
+        'mms': 'facebook/mms-tts-por',
+        'coqui': 'tts_models/pt/cv/vits',
+        'espeakng': 'pt-br',
+    },
+    'ron': {
+        'mms': 'facebook/mms-tts-ron',
+        'coqui': 'tts_models/ro/cv/vits',
+        'espeakng': 'ro',
+    },
+    'fas': {
+        'mms': 'facebook/mms-tts-fas',
+        'coqui': None,  #'tts_models/fa/custom/glow-tts',
+        'espeakng': 'fa',
+    },
+    'lin': {
+        'mms': None,
+        'coqui': 'tts_models/lin/openbible/vits',
+        'espeakng': None,
+    },
+    'mos': {
+        'mms': 'facebook/mms-tts-mos',
+        'coqui': None,
+        'espeakng': None,
+    },
+    'rus': {
+        'mms': 'facebook/mms-tts-rus',
+        'coqui': None,
+        'espeakng': 'ru',
+    }
+}
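Since several entries are None, callers have to check availability per engine. A tiny sketch (hypothetical, not in the commit) of how the table can be queried:

```python
from src.lookups import language_list, models

# Which of the configured languages have a Coqui model?
coqui_langs = [lang for lang in language_list if models[lang]['coqui'] is not None]
print(coqui_langs)  # ['spa', 'fra', 'por', 'ron', 'lin']

# app.py passes these entries straight to the synth_* helpers,
# which return None when an engine has no model for the language.
print(models['lin'])  # {'mms': None, 'coqui': 'tts_models/lin/openbible/vits', 'espeakng': None}
```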
src/synthesize.py
ADDED
@@ -0,0 +1,85 @@
+import IPython
+from huggingface_hub.inference_api import InferenceApi
+import torch
+from TTS.api import TTS
+import wave
+from espeakng import ESpeakNG
+import subprocess
+from scipy.io import wavfile
+from transformers import pipeline
+import os
+
+def synth_mms(text: str, model: str):
+    '''
+    Use Huggingface inference pipeline to synthesize text.
+    (Can be replaced by the inference API, but that requires a stored API token.)
+
+    Inputs:
+        text: Text to synthesize
+        model: Model code of the form mms-tts-LAN
+    Returns:
+        Streaming numpy and sampling rate.
+    '''
+    #inference = InferenceApi(repo_id=f"facebook/{model}",
+    #                         token=API_TOKEN)
+    #mms_tts = inference(inputs=text,
+    #                    raw_response=True)._content
+
+    if model is not None:
+        pipe = pipeline("text-to-speech", model=model, device=-1)  # Change device if it should use GPU
+        mms_tts = pipe(text)
+        return mms_tts['audio'], mms_tts['sampling_rate']
+    else:
+        return None
+
+
+def synth_coqui(text: str, model: str):
+    '''
+    Use Coqui inference API to synthesize text.
+
+    Inputs:
+        text: Text to synthesize
+        model: Model code
+    Returns:
+        Streaming wav and sampling rate.
+    '''
+    if model is not None:
+        # Get device
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Init TTS
+        tts = TTS(model, progress_bar=False).to(device)
+
+        tts.tts_to_file(text=text, file_path="test.wav", is_multi_speaker=False)
+
+        sampling_rate, wav = wavfile.read('test.wav')
+        os.remove("test.wav")
+
+        #wav = tts.tts(text=text)
+        return wav, sampling_rate
+    else:
+        return None
+
+
+def synth_espeakng(text: str, model: str):
+    '''
+    Use ESpeak-NG to synthesize text.
+
+    Inputs:
+        text: Text to synthesize
+        model: Model code
+    Returns:
+        Streaming wav and sampling rate.
+    '''
+    if model is not None:
+        subprocess.run(['espeak-ng', f'-v{model}', "-w test.wav", text]).returncode
+
+        sampling_rate, wav = wavfile.read('test.wav')
+        os.remove("test.wav")
+
+        #wav = tts.tts(text=text)
+        return wav, sampling_rate
+    else:
+        return None
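All three helpers share the same contract: pass the text plus the engine's entry from `models`, and get back `(audio, sampling_rate)` or None when no model is configured. A hypothetical sketch (not part of the commit) of driving them outside Streamlit, assuming the relevant engines are installed locally:

```python
import scipy.io.wavfile as wavfile
from src.lookups import models
from src.synthesize import synth_mms, synth_coqui, synth_espeakng

text = "Le Fonds des Nations unies pour l'enfance"
for name, fn in [("mms", synth_mms), ("coqui", synth_coqui), ("espeakng", synth_espeakng)]:
    result = fn(text, models['fra'][name])
    if result is None:   # engine has no model configured for this language
        continue
    audio, rate = result
    wavfile.write(f"fra_{name}.wav", rate=rate, data=audio.T)  # same convention app.py uses
```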
target_speaker.wav
ADDED
Binary file (51.5 kB).