Spaces:
Runtime error
Runtime error
File size: 9,530 Bytes
9439387 dbf2fc2 d0bbc40 dbf2fc2 f0380ff dbf2fc2 f4b3d1b f200d27 028ff01 f09c038 dbf2fc2 a1f131a a4db77a a1f131a f09c038 f4b3d1b f09c038 a1f131a f4b3d1b a1f131a 4639cf2 a1f131a 2915c9d dbf2fc2 f4b3d1b dbf2fc2 bb12448 ff08b05 f4b3d1b 8ee61a8 dbf2fc2 028ff01 dbf2fc2 5915225 dbf2fc2 f4b3d1b a1f131a 028ff01 ac40f21 028ff01 dbf2fc2 f4b3d1b a1f131a f4b3d1b 028ff01 dbf2fc2 f09c038 dbf2fc2 a4db77a dbf2fc2 f4b3d1b dbf2fc2 273ae2e dbf2fc2 f4b3d1b dbf2fc2 a1f131a a4db77a a1f131a 273ae2e 5c2a535 273ae2e a1f131a a4db77a a1f131a dbf2fc2 a1f131a dbf2fc2 c1358b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 |
from io import BytesIO
from typing import Tuple
import wave
import gradio as gr
import numpy as np
from pydub.audio_segment import AudioSegment
import requests
from os.path import exists
from stt import Model
import torch
from transformers import pipeline
import librosa
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# User-facing interface text.
#
# "title", "description" and "article" map a UI language code ("es" or "en")
# to the text shown in that language.  "languages" and "labels" have one more
# level: they are keyed by an internal name first and by UI language code
# second.  The "article" values are Markdown built from adjacent string
# literals.
UI_STRINGS = {
    "title": {
        "es": "Reconocimiento de Dictado en Chatino, Mixteco, Totonaco y Español",
        "en": "Speech recognition in Chatino, Mixtec, Totonac and Spanish",
    },
    "description": {
        "es": "Una demo de identificar frases del español y de tres lenguas indígenas de México, y proveer el texto de cada una",
        "en": "A demo of identifying phrases in Spanish and three Mexican indigenous languages, and providing transcripts of each",
    },
    "article": {
        "es": "La identificación de lenguas usa el modelo"
              " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
              " y aquí se supone que si la lengua no es español, debe ser la lengua indígena del contexto."
              "\n\n"
              "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
              " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
              " con [los datos recopilados por Hilaria Cruz y sus colaboradores](https://gorilla.linguistlist.org/code/ctp/)."
              "\n\n"
              "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
              " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
              " con [los datos recopilados por Rey Castillo y sus colaboradores](https://www.openslr.org/89)."
              " \n\n"
              "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
              " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
              " con [los datos recopilados por Osbel López Francisco y sus colaboradores](https://www.openslr.org/107)."
              " \n\n"
              "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/) de Jonathan Amith. "
              " Esta demo es basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
        "en": "The language identification uses the model"
              " [lang-id-commonlanguage-ecapa from Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
              " and here it is assumed that if the language is not Spanish, it must be the indigenous language of the context."
              "\n\n"
              "Chatino: Test of speech-to-text for Highland Chatino (Quiahije) "
              " using [the model trained by Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
              " with [the data compiled by Hilaria Cruz and collaborators](https://gorilla.linguistlist.org/code/ctp/)."
              "\n\n"
              "Mixtec: Test of speech-to-text for Yoloxochitl Mixtec,"
              " using [the model trained by Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
              " with [the data compiled by Rey Castillo and collaborators](https://www.openslr.org/89)."
              "\n\n"
              "Totonac: Test of speech-to-text for Highland Totonac,"
              # Fixed: this link used to point at the Chatino model page.
              " using [the model trained by Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
              " with [the data compiled by Osbel López Francisco and collaborators](https://www.openslr.org/107)."
              "\n\n"
              "The examples come from the Jonathan Amith's [DEMCA](https://demca.mesolex.org/) project. "
              " This demo is based on the one for [Ukrainian](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
    },
    "languages": {
        "mixteco": {
            "es": "mixteco",
            "en": "Mixtec",
        },
        "chatino": {
            "es": "chatino",
            "en": "Chatino",
        },
        "totonaco": {
            "es": "totonaco",
            "en": "Totonac",
        },
        "español": {
            "es": "español",
            "en": "Spanish",
        },
        "inglés": {
            "es": "inglés",
            "en": "English",
        }
    },
    "labels": {
        "target": {
            "es": "Lengua principal",
            "en": "Primary language",
        },
        "input": {
            "es": "Audio",
            "en": "Audio",
        },
        "output": {
            "es": "Resultado",
            "en": "Result",
        }
    }
}
# initialize language ID model
# The classifier is built from the hyperparameters published under `source`;
# `savedir` is the local directory handed to SpeechBrain for that model.
lang_classifier = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-commonlanguage_ecapa",
    savedir="pretrained_models/lang-id-commonlanguage_ecapa"
)
# download STT models
# Maps a language name to a (model location, local name) pair.
# For the first three entries the location is the URL of a .tflite file and
# the local name is the file it is saved to (see load_coqui_models).
# For "español" and "inglés" the location is the model identifier passed to
# load_hf_model; the second element is not read anywhere in the visible code.
model_info = {
    "mixteco": ("https://coqui.gateway.scarf.sh/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
    "chatino": ("https://coqui.gateway.scarf.sh/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
    "totonaco": ("https://coqui.gateway.scarf.sh/totonac/bozden/v1.0.0/model.tflite", "totonac.tflite"),
    "español": ("jonatasgrosman/wav2vec2-large-xlsr-53-spanish", "spanish_xlsr"),
    "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
}
def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
    """Create an automatic-speech-recognition pipeline for ``model_path``.

    Args:
        model_path: Model identifier forwarded to ``transformers.pipeline``
            as its ``model`` argument.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    task_name = "automatic-speech-recognition"
    asr_pipeline = pipeline(task_name, model=model_path)
    return asr_pipeline
def load_coqui_models(language):
    """Return the Coqui STT ``Model`` for ``language``, downloading it if needed.

    The model URL and local file name are looked up in ``model_info``.  When
    the local file already exists it is reused; otherwise it is downloaded.

    Args:
        language: Key of ``model_info`` naming the model to load.

    Returns:
        A ``stt.Model`` built from the local model file.

    Raises:
        ValueError: If ``language`` has no entry in ``model_info``.
        requests.HTTPError: If the download answers with an error status.
    """
    # Local import: the file only imports ``exists`` from ``os.path``.
    import os

    try:
        model_url, file_name = model_info[language]
    except KeyError:
        raise ValueError(
            f"No STT model is configured for language {language!r}"
        ) from None

    if exists(file_name):
        print(f"Found {file_name}. Skipping download...")
    else:
        print(f"Downloading {model_url}")
        response = requests.get(model_url, allow_redirects=True, timeout=60)
        # Without this check an HTTP error page would be saved as the model
        # and then reused on every later start by the "Found ..." branch.
        response.raise_for_status()
        # Write to a temporary name and rename, so an interrupted download
        # never leaves a truncated file under the final name.
        partial_name = f"{file_name}.part"
        with open(partial_name, 'wb') as file:
            file.write(response.content)
        os.replace(partial_name, file_name)

    return Model(file_name)
# Speech-to-text backends keyed by language name.
# Spanish is served by the Hugging Face pipeline; it is loaded first.
STT_MODELS = {"español": load_hf_model(model_info["español"][0])}
# The three indigenous languages use Coqui models, loaded in this order.
for lang in ('mixteco', 'chatino', 'totonaco'):
    STT_MODELS[lang] = load_coqui_models(lang)
def client(audio_data: np.array, sample_rate: int, default_lang: str, ui_language: str = "es"):
    """Identify the spoken language and transcribe the audio accordingly.

    The audio is converted to a WAV buffer by ``_convert_audio`` and passed
    to ``lang_classifier``.  If the first predicted label is ``'Spanish'`` the
    Spanish pipeline in ``STT_MODELS`` transcribes it; any other label is
    treated as ``default_lang`` and its Coqui model is used.

    Args:
        audio_data: Raw samples as received from the UI.
        sample_rate: Sampling rate of ``audio_data``.
        default_lang: Key of ``STT_MODELS`` / ``UI_STRINGS["languages"]`` used
            when the classifier does not answer ``'Spanish'``.
        ui_language: UI language code used to look up the language name shown
            in the result.  The function previously read an undefined
            ``ui_language`` name, which raised ``NameError`` on every call.

    Returns:
        A string of the form ``"<language name>: <transcript>"``.
    """
    # _convert_audio resamples to this rate before exporting the WAV buffer.
    target_rate = 16000

    output_audio = _convert_audio(audio_data, sample_rate)

    waveform, _ = torchaudio.load(output_audio)
    # Only the text labels of the classifier's 4-tuple are needed.
    _, _, _, labels = lang_classifier.classify_batch(waveform)
    detected = labels[0]
    print(default_lang, detected)

    # The buffer was consumed by torchaudio; rewind before decoding it again.
    output_audio.seek(0)
    if detected == 'Spanish':
        # librosa resamples to 22050 Hz unless told otherwise, and the
        # pipeline does not verify the rate of raw arrays it is given.
        hf_audio, _ = librosa.load(output_audio, sr=target_rate)
        language_name = UI_STRINGS["languages"]['español'][ui_language]
        asr_pipeline = STT_MODELS['español']
        result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
    else:
        with wave.open(output_audio, 'rb') as fin:
            coqui_audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        language_name = UI_STRINGS["languages"][default_lang][ui_language]
        ds = STT_MODELS[default_lang]
        result = ds.stt(coqui_audio)

    return f"{language_name}: {result}"
def stt(default_lang: str, audio: Tuple[int, np.array], state=None):
    """Gradio callback: transcribe ``audio`` and pass ``state`` through.

    Args:
        default_lang: Language assumed when the audio is not classified as
            Spanish; forwarded to ``client``.
        audio: ``(sample_rate, samples)`` pair.
        state: Opaque value returned unchanged as the second output.

    Returns:
        A ``(recognized_text, state)`` tuple.
    """
    # Unpack into a new name instead of rebinding the ``audio`` parameter;
    # the unused ``use_scorer`` local has been removed.
    sample_rate, samples = audio
    recognized_result = client(samples, sample_rate, default_lang)
    return recognized_result, state
def _convert_audio(audio_data: np.array, sample_rate: int):
    """Convert raw samples to a 16 kHz mono 16-bit PCM WAV held in memory.

    Args:
        audio_data: Sample array whose bytes are read as 16-bit PCM.  A 1-D
            array is treated as mono; a 2-D array is assumed to be laid out
            as (samples, channels) — TODO confirm against the Gradio version
            in use.
        sample_rate: Sampling rate of ``audio_data``.

    Returns:
        A ``BytesIO`` positioned at offset 0 that contains the WAV data.
    """
    # The channel count was hard-coded to 1, so multi-channel input was read
    # as mono audio of several times the real length.
    channel_count = audio_data.shape[1] if audio_data.ndim > 1 else 1

    # tobytes() yields C-order bytes, i.e. interleaved frames for 2-D input.
    source_audio = BytesIO(audio_data.tobytes())
    segment = AudioSegment.from_raw(
        source_audio,
        channels=channel_count,
        sample_width=2,
        frame_rate=sample_rate
    )

    output_audio = BytesIO()
    segment.set_frame_rate(16000).set_channels(1).export(output_audio, "wav", codec="pcm_s16le")
    # Rewind so callers can read the WAV from the start.
    output_audio.seek(0)
    return output_audio
def iface(ui_language):
    """Build the Gradio interface with its text in ``ui_language``.

    Args:
        ui_language: Language code used to index ``UI_STRINGS``
            (``"es"`` or ``"en"``).

    Returns:
        A ``gr.Interface`` wired to ``stt``.
    """
    return gr.Interface(
        fn=stt,
        inputs=[
            gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label=UI_STRINGS["labels"]["target"][ui_language]),
            gr.inputs.Audio(type="numpy", label=UI_STRINGS["labels"]["input"][ui_language], source="microphone", optional=False),
            gr.inputs.State(label="Resultado esperado")
        ],
        outputs=[
            gr.outputs.Textbox(label=UI_STRINGS["labels"]["output"][ui_language]),
            gr.outputs.State(label="Resultado esperado")
        ],
        title=UI_STRINGS["title"][ui_language],
        theme="huggingface",
        description=UI_STRINGS["description"][ui_language],
        # Each example is [target language, audio file, expected-result state].
        examples=[
            ["mixteco", "ejemplos/espanol1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "español: "],
            ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "español: "],
            ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "mixteco: "],
            ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "mixteco: "],
            ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav", "totonaco: "],
            ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav", "totonaco: "],
        ],
        # This used to read UI_STRINGS["title"], so the article text with the
        # model and data credits was never displayed.
        article=UI_STRINGS["article"][ui_language],
    )
# Build the interface with Spanish UI text and start serving it.
# This runs at module level, i.e. whenever the file is executed or imported.
es_iface = iface('es')
es_iface.launch()
|