Commit 1dfec92 (parent 8377a77) by Irpan, committed with message "asr"
app.py CHANGED
@@ -13,13 +13,16 @@ mms_transcribe = gr.Interface(
             label="Select Model for ASR",
             value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
             interactive=True
-        )
+        ),
     ],
-    outputs="text",
+    outputs=[
+        gr.Textbox(label="Uyghur Arabic Transcription"),
+        gr.Textbox(label="Uyghur Latin Transcription"),
+    ],
     #examples=ASR_EXAMPLES,
     title="Speech-to-text",
     description=(
-        "Transcribe audio from a microphone or input file."
+        "Transcribe Uyghur speech audio from a microphone or input file."
     ),
     #article=ASR_NOTE,
     allow_flagging="never",
@@ -29,7 +32,7 @@ mms_synthesize = gr.Interface(
     fn=tts.synthesize,
     inputs=[
        gr.Text(label="Input text"),
-
+        gr.Dropdown(
            choices=[model for model in tts.models_info],
            label="Select Model for TTS",
            value="Meta-MMS",
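The interesting change here is that outputs goes from a single "text" component to two labeled Textboxes, which means the wired function must now return a pair. A minimal, self-contained sketch of that wiring, with a hypothetical stub in place of the real asr.transcribe (Gradio maps an N-tuple return onto N output components):

import gradio as gr

def stub_transcribe(audio, model_id):
    # Stand-in for asr.transcribe: returns (arabic, latin) transcriptions
    return "arabic-script text", "latin-script text"

demo = gr.Interface(
    fn=stub_transcribe,
    inputs=[gr.Audio(type="filepath"), gr.Text(label="Model")],
    outputs=[
        gr.Textbox(label="Uyghur Arabic Transcription"),
        gr.Textbox(label="Uyghur Latin Transcription"),
    ],
)
# demo.launch()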
asr.py CHANGED
@@ -9,49 +9,55 @@ from transformers import (
     Wav2Vec2ForCTC
 )
 import numpy as np
+import util
 
 # Load processor and model
 models_info = {
     "OpenAI-Whisper-Uzbek": {
         "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "ixxan/whisper-small-thugy20": {
         "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-thugy20"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-thugy20"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "ixxan/whisper-small-uyghur-common-voice": {
         "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "Meta-MMS": {
         "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
         "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
-        "ctc_model": True
+        "ctc_model": True,
+        "arabic_script": True
     },
     "ixxan/wav2vec2-large-mms-1b-uyghur-latin": {
         "processor": Wav2Vec2Processor.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin", target_lang='uig-script_latin'),
         "model": Wav2Vec2ForCTC.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin", target_lang='uig-script_latin'),
-        "ctc_model": True
+        "ctc_model": True,
+        "arabic_script": False
     },
 }
 
-def transcribe(audio_data, model_id) -> str:
-    if model_id == "Compare All Models":
-        return transcribe_all_models(audio_data)
-    else:
-        return transcribe_with_model(audio_data, model_id)
+# def transcribe(audio_data, model_id) -> str:
+#     if model_id == "Compare All Models":
+#         return transcribe_all_models(audio_data)
+#     else:
+#         return transcribe_with_model(audio_data, model_id)
 
-def transcribe_all_models(audio_data) -> dict:
-    transcriptions = {}
-    for model_id in models_info.keys():
-        transcriptions[model_id] = transcribe_with_model(audio_data, model_id)
-    return transcriptions
+# def transcribe_all_models(audio_data) -> dict:
+#     transcriptions = {}
+#     for model_id in models_info.keys():
+#         transcriptions[model_id] = transcribe_with_model(audio_data, model_id)
+#     return transcriptions
 
-def transcribe_with_model(audio_data, model_id) -> str:
+def transcribe(audio_data, model_id) -> str:
     # Load audio file
     if not audio_data:
         return "<<ERROR: Empty Audio Input>>"
@@ -97,4 +103,10 @@ def transcribe_with_model(audio_data, model_id) -> str:
     generated_ids = model.generate(inputs["input_features"], max_length=225)
     transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-    return transcription
+    if models_info[model_id]["arabic_script"]:
+        transcription_arabic = transcription
+        transcription_latin = util.ug_arab_to_latn(transcription)
+    else: # Latin script output
+        transcription_arabic = util.ug_latn_to_arab(transcription)
+        transcription_latin = transcription
+    return transcription_arabic, transcription_latin
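The per-model arabic_script flag drives a symmetric post-processing step: whichever script a model emits natively, transcribe now returns an (Arabic, Latin) pair. A standalone sketch of just that branch, with identity stubs standing in for the util converters so it runs without the umsc package:

# Identity stubs standing in for util.ug_arab_to_latn / util.ug_latn_to_arab.
stub_arab_to_latn = lambda s: s
stub_latn_to_arab = lambda s: s

def to_both_scripts(transcription: str, arabic_script: bool) -> tuple:
    # Mirror of the branch added at the end of transcribe()
    if arabic_script:
        return transcription, stub_arab_to_latn(transcription)
    else:
        return stub_latn_to_arab(transcription), transcription

print(to_both_scripts("salam", arabic_script=False))  # ('salam', 'salam') with the stubs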
tts.py CHANGED
@@ -1,17 +1,21 @@
 from transformers import VitsModel, AutoTokenizer
 import torch
 import scipy.io.wavfile
+import util
 
 # Load processor and model
 models_info = {
     "Meta-MMS": {
         "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
         "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
+        "arabic_script": True
     },
 }
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 def synthesize(text, model_id):
+    if models_info[model_id]["arabic_script"]:
+        text = util.ug_latn_to_arab(text)
     processor = models_info[model_id]["processor"]
     model = models_info[model_id]["model"].to(device)
     inputs = processor(text, return_tensors="pt").to(device)
@@ -20,7 +24,7 @@ def synthesize(text, model_id):
     output = model(**inputs).waveform.cpu() # Move output back to CPU for saving
 
     output_path = "tts_output.wav"
-    sample_rate =
+    sample_rate = model.config.sample_rate
     scipy.io.wavfile.write(output_path, rate=sample_rate, data=output.numpy()[0])
 
     return output_path
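One caveat worth flagging on the new sample_rate line: in the transformers releases I know of, VitsConfig names this field sampling_rate rather than sample_rate, so the attribute lookup as committed may fail. A defensive variant, under that assumption (the 16000 fallback matches the rate documented for the facebook/mms-tts-* checkpoints):

# Hedged helper: read the output rate from a transformers VITS config.
# Assumes VitsConfig exposes `sampling_rate` (not `sample_rate`); falls back
# to 16000, the documented rate of the facebook/mms-tts-* checkpoints.
def get_output_rate(config) -> int:
    return getattr(config, "sample_rate", None) or getattr(config, "sampling_rate", 16000)

Inside synthesize this would read sample_rate = get_output_rate(model.config).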
util.py ADDED
@@ -0,0 +1,5 @@
+from umsc import UgMultiScriptConverter
+
+# Initialize uyghur script converter
+ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
+ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')