# Catalan TTS Arena — Hugging Face Space (page header residue removed; status was "Running")
import gradio as gr
# Date shown in the UI footer for the last leaderboard refresh.
LAST_UPDATED = "Nov 25th 2024"
####################################
# Static leaderboard data
####################################
# One entry per evaluated TTS model. Metric directions:
# STOI, PESQ, UTMOS — higher is better; WER — lower is better.
leaderboard_data = [
    {'name': 'StyleTTS 2', 'STOI': 0.998, 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 3.47},
    {'name': 'Matxa-TTS', 'STOI': 0.996, 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50},
    {'name': 'Matxa-TTS-multiaccent', 'STOI': 0.996, 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98},
    {'name': 'StableTTS', 'STOI': 0.997, 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62},
]
# Markdown body for the "Metrics" tab (rendered at runtime by gr.Markdown).
METRICS_TAB_TEXT = """
## Metrics
Models in the leaderboard are evaluated using several key metrics:
* **UTMOS** (UTokyo-SaruLab Mean Opinion Score),
* **WER** (Word Error Rate),
* **STOI** (Short-Time Objective Intelligibility),
* **PESQ** (Perceptual Evaluation of Speech Quality).
These metrics help evaluate both the accuracy and quality of the model.
### UTMOS (UTokyo-SaruLab Mean Opinion Score)[[Paper](https://arxiv.org/abs/2204.02152)]
UTMOS is a MOS prediction system. **A higher UTMOS indicates better quality** of the generated voice.
### WER (Word Error Rate)
WER is a common metric for evaluating speech recognition systems. It measures the percentage of words in the generated transcript that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
Example:
| Reference   | the  | cat | sat     | on  | the  | mat |
|-------------|------|-----|---------|-----|------|-----|
| Prediction  | the  | cat | **sit** | on  | the  |     |
| Label       | ✅   | ✅  | S       | ✅  | ✅   | D   |
The WER calculation is done as follows:
```
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
```
### STOI (Short-Time Objective Intelligibility)[[Paper](https://ieeexplore.ieee.org/abstract/document/5495701?casa_token=PLtqLc8KNAgAAAAA:FOLuZ4dgMYsnGb1dQHgqVOouQzRJ3vA5yqj-sbwf8gs9Q-AIDCLkMZzAgzRrAogwwxULK9zsYeE)]
STOI measures the intelligibility of the synthesized speech signal compared to the original signal. **A higher STOI indicates better intelligibility**.
### PESQ (Perceptual Evaluation of Speech Quality)[[Paper](https://ieeexplore.ieee.org/abstract/document/941023?casa_token=jdtHy84_KhQAAAAA:qHN3WbT6cNdufj6OOn_fn0Je0RedMv-WJCmhQ_3CWy4nMTuDvFMF3KstAmKqLx5suQwdPgGByoY)]
PESQ is a perceptual metric that evaluates the quality of speech in a similar manner to how a human listener would. **A higher PESQ indicates better voice quality**.
## Benchmark Datasets
Model performance is evaluated using [our test datasets](https://huggingface.co/spaces/projecte-aina/catalan_tts_arena/blob/main/bsc.txt). These datasets cover a variety of domains and acoustic conditions, ensuring a robust evaluation.
"""
# BibTeX snippet shown in the "Citation" accordion. Note: BibTeX requires
# multiple authors to be joined with "and", not commas.
CITATION_TEXT = """@misc{catalan-tts-arena,
title = {Catalan Text-to-Speech Leaderboard},
author = {Rodolfo Zevallos and José Giraldo and Alex Peiró-Lilja and Carme Armentano-Oller},
year = 2024,
publisher = {Hugging Face},
howpublished = "\\url{https://huggingface.co/spaces/projecte-aina/catalan_tts_arena}"
}
"""
# Markdown banner rendered at the top of the app.
DESCR = """
# 🏆 Catalan TTS Arena: Benchmarking TTS Models
\nThe Catalan TTS Leaderboard ranks and evaluates TTS models in Catalan.
\nThe leaderboard currently focuses on Catalan TTS, and will be expanded to multilingual evaluation in later versions.
""".strip()
####################################
# Functions (static version)
####################################
def get_leaderboard(data=None):
    """Return leaderboard display rows sorted by UTMOS, descending.

    Args:
        data: Optional list of metric dicts with keys ``'name'``,
            ``'STOI'``, ``'PESQ'``, ``'WER'`` and ``'UTMOS'``. Defaults
            to the module-level ``leaderboard_data``.

    Returns:
        A list of rows ``[rank, name, UTMOS, WER, STOI, PESQ]`` with
        1-indexed ranks assigned by descending UTMOS.
    """
    if data is None:
        data = leaderboard_data
    # NOTE(review): earlier comments claimed a PESQ-then-UTMOS ordering,
    # but the code sorts by UTMOS only; the docstring now reflects that.
    ranked = sorted(data, key=lambda m: m['UTMOS'], reverse=True)
    # Build the display rows without mutating the shared input dicts
    # (the previous version injected a 'rank' key into each entry).
    return [
        [rank, m['name'], m['UTMOS'], m['WER'], m['STOI'], m['PESQ']]
        for rank, m in enumerate(ranked, start=1)
    ]
####################################
# Gradio interface
####################################
theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)
with gr.Blocks(theme=theme) as demo:
    gr.Markdown(DESCR, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.DataFrame(
                headers=["Rank", "Model", "UTMOS ⬆️️", "WER ⬇️", "STOI", "PESQ"],
                datatype=["str", "str", "str", "str", "str", "str"],
                value=get_leaderboard()  # Initial table contents
            )
        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")
    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT, lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )
# Launch the app (API disabled; up to 40 concurrent requests per event)
demo.queue(api_open=False, default_concurrency_limit=40).launch(show_api=False)