File size: 9,530 Bytes
9439387
dbf2fc2
 
d0bbc40
dbf2fc2
 
 
 
f0380ff
dbf2fc2
f4b3d1b
f200d27
028ff01
f09c038
 
dbf2fc2
a1f131a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4db77a
a1f131a
 
 
 
 
 
f09c038
f4b3d1b
 
 
 
f09c038
 
a1f131a
f4b3d1b
 
 
 
 
 
 
 
a1f131a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4639cf2
a1f131a
 
2915c9d
dbf2fc2
f4b3d1b
dbf2fc2
bb12448
 
ff08b05
f4b3d1b
8ee61a8
dbf2fc2
028ff01
 
 
 
dbf2fc2
 
5915225
dbf2fc2
f4b3d1b
a1f131a
028ff01
ac40f21
028ff01
dbf2fc2
f4b3d1b
a1f131a
f4b3d1b
028ff01
dbf2fc2
f09c038
dbf2fc2
 
a4db77a
dbf2fc2
 
 
f4b3d1b
dbf2fc2
273ae2e
dbf2fc2
 
 
 
 
 
 
 
 
 
 
 
 
f4b3d1b
dbf2fc2
 
 
a1f131a
 
 
 
 
 
a4db77a
a1f131a
273ae2e
 
5c2a535
273ae2e
a1f131a
 
 
a4db77a
 
 
 
 
 
a1f131a
 
dbf2fc2
a1f131a
dbf2fc2
c1358b8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
from io import BytesIO   
from typing import Tuple
import wave
import gradio as gr 
import numpy as np
from pydub.audio_segment import AudioSegment
import requests
from os.path import exists
from stt import Model  

import torch
from transformers import pipeline
import librosa
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# All user-visible strings, keyed by purpose and then by UI language code
# ("es" = Spanish, "en" = English). `iface()` selects one language at build time.
UI_STRINGS = {
    "title": {
        "es": "Reconocimiento de Dictado en Chatino, Mixteco, Totonaco y Español",
        "en": "Speech recognition in Chatino, Mixtec, Totonac and Spanish",
    },
    "description": {
        "es": "Una demo de identificar frases del español y de tres lenguas indígenas de México, y proveer el texto de cada una",
        "en": "A demo of identifying phrases in Spanish and three Mexican indigenous languages, and providing transcripts of each",
    },
    "article": {
        "es":  "La identificación de lenguas usa el modelo"
                " [lang-id-commonlanguage-ecapa de Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
                " y aquí se supone que si la lengua no es español, debe ser la lengua indígena del contexto."
                "\n\n"
                "Chatino: Prueba de dictado a texto para el chatino de la sierra (Quiahije) "
                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
                " con [los datos recopilados por Hilaria Cruz y sus colaboradores](https://gorilla.linguistlist.org/code/ctp/)."
                "\n\n"
                "Mixteco: Prueba de dictado a texto para el mixteco de Yoloxochitl,"
                " usando [el modelo entrenado por Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
                " con [los datos recopilados por Rey Castillo y sus colaboradores](https://www.openslr.org/89)."
                " \n\n"
                "Totonaco: Prueba de dictado a texto para el totonaco de la sierra,"
                " usando [el modelo entrenado por Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
                " con [los datos recopilados por Osbel López Francisco y sus colaboradores](https://www.openslr.org/107)."
                " \n\n"
                "Los ejemplos vienen del proyecto [DEMCA](https://demca.mesolex.org/) de Jonathan Amith. "
                # Fixed grammar: "es basada" -> "está basada".
                " Esta demo está basada en la de [Ukraniano](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
        "en": "The language identification uses the model"
                " [lang-id-commonlanguage-ecapa from Speechbrain](https://huggingface.co/speechbrain/lang-id-commonlanguage_ecapa)"
                " and here it is assumed that if the language is not Spanish, it must be the indigenous language of the context."
                "\n\n"
                "Chatino: Test of speech-to-text for Highland Chatino (Quiahije) "
                " using [the model trained by Bülent Özden](https://coqui.ai/chatino/bozden/v1.0.0)"
                " with [the data compiled by Hilaria Cruz and collaborators](https://gorilla.linguistlist.org/code/ctp/)."
                "\n\n"
                "Mixtec: Test of speech-to-text for Yoloxochitl Mixtec,"
                " using [the model trained by Josh Meyer](https://coqui.ai/mixtec/jemeyer/v1.0.0/)"
                " with [the data compiled by Rey Castillo and collaborators](https://www.openslr.org/89)."
                "\n\n"
                "Totonac: Test of speech-to-text for Highland Totonac,"
                # Bug fix: this linked the *chatino* model URL; the Spanish
                # version (above) correctly links the totonac model.
                " using [the model trained by Bülent Özden](https://coqui.ai/totonac/bozden/v1.0.0)"
                " with [the data compiled by Osbel López Francisco and collaborators](https://www.openslr.org/107)."
                "\n\n"
                # Fixed grammar: "the Jonathan Amith's" -> "Jonathan Amith's".
                "The examples come from Jonathan Amith's [DEMCA](https://demca.mesolex.org/) project. "
                " This demo is based on the one for [Ukrainian](https://huggingface.co/spaces/robinhad/ukrainian-stt).",
    },
    # Localized display names for each recognizable language; keys are the
    # canonical (Spanish) identifiers used throughout this file.
    "languages": {
        "mixteco": {
            "es": "mixteco",
            "en": "Mixtec",
        },
        "chatino": {
            "es": "chatino",
            "en": "Chatino",
        },
        "totonaco": {
            "es": "totonaco",
            "en": "Totonac",
        },
        "español": {
            "es": "español",
            "en": "Spanish",
        },
        "inglés": {
            "es": "inglés",
            "en": "English",
        }
    },
    # Widget labels for the Gradio interface.
    "labels": {
        "target": {
            "es": "Lengua principal",
            "en": "Primary language",
        },
        "input": {
            "es": "Audio",
            "en": "Audio",
        },
        "output": {
            "es": "Resultado",
            "en": "Result",
        }
    }
}


# Initialize the spoken-language-identification model. On first run this
# downloads the pretrained checkpoint into pretrained_models/; later runs
# reuse the cached copy. Its `classify_batch` output is consumed in client().
lang_classifier = EncoderClassifier.from_hparams(
    source="speechbrain/lang-id-commonlanguage_ecapa", 
    savedir="pretrained_models/lang-id-commonlanguage_ecapa"
)


# Per-language model locations. The three indigenous languages use Coqui STT
# tflite checkpoints as (download URL, local cache filename) pairs; Spanish
# and English use Hugging Face hub ids (second element is unused for those).
_COQUI_GATEWAY = "https://coqui.gateway.scarf.sh"
model_info = {
    "mixteco": (_COQUI_GATEWAY + "/mixtec/jemeyer/v1.0.0/model.tflite", "mixtec.tflite"),
    "chatino": (_COQUI_GATEWAY + "/chatino/bozden/v1.0.0/model.tflite", "chatino.tflite"),
    "totonaco": (_COQUI_GATEWAY + "/totonac/bozden/v1.0.0/model.tflite", "totonac.tflite"),
    "español": ("jonatasgrosman/wav2vec2-large-xlsr-53-spanish", "spanish_xlsr"),
    "inglés": ("facebook/wav2vec2-large-robust-ft-swbd-300h", "english_xlsr"),
}


def load_hf_model(model_path="facebook/wav2vec2-large-robust-ft-swbd-300h"):
    """Build a Hugging Face ASR pipeline for the given hub model id."""
    task = "automatic-speech-recognition"
    return pipeline(task, model=model_path)


def load_coqui_models(language):
    """Return a Coqui STT ``Model`` for *language*.

    Downloads the tflite checkpoint to the local cache filename from
    ``model_info`` on first use; later calls reuse the cached file.

    Raises:
        KeyError: if *language* has no entry in ``model_info`` (the
            original silently attempted to download from an empty URL).
        requests.HTTPError: if the checkpoint download fails.
    """
    if language not in model_info:
        raise KeyError(f"No STT model registered for language {language!r}")
    model_path, file_name = model_info[language]

    if not exists(file_name):
        print(f"Downloading {model_path}")
        r = requests.get(model_path, allow_redirects=True)
        # Fail loudly instead of writing an HTTP error page to the model file.
        r.raise_for_status()
        with open(file_name, 'wb') as file:
            file.write(r.content)
    else:
        print(f"Found {file_name}. Skipping download...")
    return Model(file_name)


# Eagerly load every recognizer at import time: Spanish as a Hugging Face
# pipeline, the three indigenous languages as (possibly downloaded) Coqui
# STT models. client() looks these up by the canonical language key.
STT_MODELS = {lang: load_hf_model(model_info[lang][0]) for lang in ("español",)}
for lang in ('mixteco', 'chatino', 'totonaco'):
    STT_MODELS[lang] = load_coqui_models(lang)


def client(audio_data: np.ndarray, sample_rate: int, default_lang: str, ui_language: str = "es"):
    """Identify the spoken language of a clip and return its transcript.

    Args:
        audio_data: raw 16-bit mono PCM samples from the Gradio widget.
        sample_rate: sample rate of ``audio_data`` in Hz.
        default_lang: the indigenous language to assume when the clip is
            not Spanish (a key of ``STT_MODELS`` / ``UI_STRINGS["languages"]``).
        ui_language: UI language code ("es" or "en") used to localize the
            language name in the result; defaults to "es".
            Bug fix: the original read an undefined global ``ui_language``,
            which raised NameError at runtime.

    Returns:
        ``"<localized language name>: <transcript>"``.
    """
    output_audio = _convert_audio(audio_data, sample_rate)

    # Language identification on the normalized 16 kHz mono waveform.
    waveform, _ = torchaudio.load(output_audio)
    out_prob, score, index, text_lab = lang_classifier.classify_batch(waveform)
    text_lab = text_lab[0]

    # Decode the same buffer twice: as int16 frames for Coqui STT ...
    output_audio.seek(0)
    fin = wave.open(output_audio, 'rb')
    coqui_audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    fin.close()

    # ... and as a float array for the Hugging Face pipeline.
    output_audio.seek(0)
    hf_audio, _ = librosa.load(output_audio)

    print(default_lang, text_lab)

    if text_lab == 'Spanish':
        # Spanish goes through the wav2vec2 pipeline.
        text_lab = UI_STRINGS["languages"]['español'][ui_language]
        asr_pipeline = STT_MODELS['español']
        result = asr_pipeline(hf_audio, chunk_length_s=5, stride_length_s=1)['text']
    else:
        # Any non-Spanish detection is assumed to be the selected
        # indigenous language (see the article text in UI_STRINGS).
        text_lab = UI_STRINGS["languages"][default_lang][ui_language]
        ds = STT_MODELS[default_lang]
        result = ds.stt(coqui_audio)

    return f"{text_lab}: {result}"


def stt(default_lang: str, audio: Tuple[int, np.ndarray], state=None):
    """Gradio callback: transcribe *audio* and pass *state* through.

    Args:
        default_lang: radio-button choice of the primary indigenous language.
        audio: ``(sample_rate, samples)`` pair from the Gradio audio input.
        state: opaque Gradio state, returned unchanged.

    Returns:
        ``(transcript_string, state)``.
    """
    # The unused `use_scorer` local from an earlier revision was removed.
    sample_rate, audio_data = audio
    recognized_result = client(audio_data, sample_rate, default_lang)
    return recognized_result, state


def _convert_audio(audio_data: np.array, sample_rate: int):
    """Resample raw mono 16-bit PCM to an in-memory 16 kHz mono WAV.

    Returns a BytesIO positioned at the start of the WAV data.
    """
    raw_buffer = BytesIO()
    raw_buffer.write(audio_data)
    raw_buffer.seek(0)

    segment = AudioSegment.from_raw(
        raw_buffer,
        channels=1,
        sample_width=2,
        frame_rate=sample_rate
    )
    normalized = segment.set_frame_rate(16000).set_channels(1)

    wav_buffer = BytesIO()
    normalized.export(wav_buffer, "wav", codec="pcm_s16le")
    wav_buffer.seek(0)
    return wav_buffer

def iface(ui_language):
    """Build the Gradio Interface localized for *ui_language* ("es" or "en")."""
    labels = UI_STRINGS["labels"]
    return gr.Interface(
        fn=stt,
        inputs=[
            gr.inputs.Radio(choices=("chatino", "mixteco", "totonaco"), default="mixteco", label=labels["target"][ui_language]),
            gr.inputs.Audio(type="numpy", label=labels["input"][ui_language], source="microphone", optional=False),
            gr.inputs.State(label="Resultado esperado")
        ],
        outputs=[
            gr.outputs.Textbox(label=labels["output"][ui_language]),
            gr.outputs.State(label="Resultado esperado")
        ],
        title=UI_STRINGS["title"][ui_language],
        theme="huggingface",
        description=UI_STRINGS["description"][ui_language],
        examples=[["mixteco", "ejemplos/espanol1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "español: "],
                ["mixteco", "ejemplos/espanol2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "español: "],
                ["mixteco", "ejemplos/mixteco1-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "mixteco: "],
                ["mixteco", "ejemplos/mixteco2-Yolox_BotFl_CTB501-FEF537-EGS503_40202-Acanthaceae-Ruellia_2017-01-05-h.wav", "mixteco: "],
                ["totonaco", "ejemplos/totonaco1-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav", "totonaco: "],
                ["totonaco", "ejemplos/totonaco2-Zongo_Botan_Acanthaceae-Justicia-spicigera_SLC388-IPN389_2018-07-26-i.wav", "totonaco: "]],
        # Bug fix: this previously passed the *title* as the article, so the
        # long attribution text in UI_STRINGS["article"] was never displayed.
        article=UI_STRINGS["article"][ui_language],
    )

# Build and launch the Spanish-language UI. (An English UI could be
# created the same way with iface('en').)
es_iface = iface('es')

es_iface.launch()