Irpan committed
Commit 1dfec92
1 Parent(s): 8377a77
Files changed (4):
  1. app.py +7 -4
  2. asr.py +29 -17
  3. tts.py +5 -1
  4. util.py +5 -0
app.py CHANGED
@@ -13,13 +13,16 @@ mms_transcribe = gr.Interface(
             label="Select Model for ASR",
             value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
             interactive=True
-        )
+        ),
+    ],
+    outputs=[
+        gr.Textbox(label="Uyghur Arabic Transcription"),
+        gr.Textbox(label="Uyghur Latin Transcription"),
     ],
-    outputs="text",
     #examples=ASR_EXAMPLES,
     title="Speech-to-text",
     description=(
-        "Transcribe audio from a microphone or input file."
+        "Transcribe Uyghur speech audio from a microphone or input file."
     ),
     #article=ASR_NOTE,
     allow_flagging="never",
@@ -29,7 +32,7 @@ mms_synthesize = gr.Interface(
     fn=tts.synthesize,
     inputs=[
         gr.Text(label="Input text"),
-        gr.Dropdown(
+        gr.Dropdown(
             choices=[model for model in tts.models_info],
             label="Select Model for TTS",
             value="Meta-MMS",
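The ASR tab now returns a pair of strings, so `outputs` becomes a list of two labeled `gr.Textbox` components and Gradio distributes the returned tuple across them in order. A minimal sketch of this one-function-to-many-outputs pattern, with an illustrative `transcribe_stub` (and sample strings) standing in for `asr.transcribe`:

```python
import gradio as gr

def transcribe_stub(audio, model_id):
    # Stand-in for asr.transcribe: returns the (Arabic, Latin) pair that
    # Gradio maps onto the two output components in order.
    return "ياخشىمۇسىز", "yaxshimusiz"

demo = gr.Interface(
    fn=transcribe_stub,
    inputs=[gr.Audio(type="filepath"), gr.Text(label="Model")],
    outputs=[
        gr.Textbox(label="Uyghur Arabic Transcription"),
        gr.Textbox(label="Uyghur Latin Transcription"),
    ],
)
```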
asr.py CHANGED
@@ -9,49 +9,55 @@ from transformers import (
     Wav2Vec2ForCTC
 )
 import numpy as np
+import util
 
 # Load processor and model
 models_info = {
     "OpenAI-Whisper-Uzbek": {
         "processor": WhisperProcessor.from_pretrained("openai/whisper-small", language="uzbek", task="transcribe"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "ixxan/whisper-small-thugy20": {
         "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-thugy20"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-thugy20"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "ixxan/whisper-small-uyghur-common-voice": {
         "processor": AutoProcessor.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
         "model": AutoModelForSpeechSeq2Seq.from_pretrained("ixxan/whisper-small-uyghur-common-voice"),
-        "ctc_model": False
+        "ctc_model": False,
+        "arabic_script": False
     },
     "Meta-MMS": {
         "processor": AutoProcessor.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic'),
         "model": AutoModelForCTC.from_pretrained("facebook/mms-1b-all", target_lang='uig-script_arabic', ignore_mismatched_sizes=True),
-        "ctc_model": True
+        "ctc_model": True,
+        "arabic_script": True
     },
     "ixxan/wav2vec2-large-mms-1b-uyghur-latin": {
         "processor": Wav2Vec2Processor.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin", target_lang='uig-script_latin'),
         "model": Wav2Vec2ForCTC.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin", target_lang='uig-script_latin'),
-        "ctc_model": True
+        "ctc_model": True,
+        "arabic_script": False
     },
 }
 
-def transcribe(audio_data, model_id) -> str:
-    if model_id == "Compare All Models":
-        return transcribe_all_models(audio_data)
-    else:
-        return transcribe_with_model(audio_data, model_id)
+# def transcribe(audio_data, model_id) -> str:
+#     if model_id == "Compare All Models":
+#         return transcribe_all_models(audio_data)
+#     else:
+#         return transcribe_with_model(audio_data, model_id)
 
-def transcribe_all_models(audio_data) -> dict:
-    transcriptions = {}
-    for model_id in models_info.keys():
-        transcriptions[model_id] = transcribe_with_model(audio_data, model_id)
-    return transcriptions
+# def transcribe_all_models(audio_data) -> dict:
+#     transcriptions = {}
+#     for model_id in models_info.keys():
+#         transcriptions[model_id] = transcribe_with_model(audio_data, model_id)
+#     return transcriptions
 
-def transcribe_with_model(audio_data, model_id) -> str:
+def transcribe(audio_data, model_id) -> str:
     # Load audio file
     if not audio_data:
         return "<<ERROR: Empty Audio Input>>"
@@ -97,4 +103,10 @@ def transcribe_with_model(audio_data, model_id) -> str:
     generated_ids = model.generate(inputs["input_features"], max_length=225)
     transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-    return transcription
+    if models_info[model_id]["arabic_script"]:
+        transcription_arabic = transcription
+        transcription_latin = util.ug_arab_to_latn(transcription)
+    else:  # Latin script output
+        transcription_arabic = util.ug_latn_to_arab(transcription)
+        transcription_latin = transcription
+    return transcription_arabic, transcription_latin
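One detail worth flagging in the new `transcribe`: the empty-audio guard still returns a single string, while the success path now returns an (Arabic, Latin) pair and the interface exposes two textboxes. If Gradio handles mismatched return arity the way I expect, every code path should yield two values; a hedged sketch of that adjustment, assuming the guard shown in the diff:

```python
# Return the error text for both output boxes so the early-return path
# has the same two-value shape as the success path.
if not audio_data:
    error = "<<ERROR: Empty Audio Input>>"
    return error, error
```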
tts.py CHANGED
@@ -1,17 +1,21 @@
 from transformers import VitsModel, AutoTokenizer
 import torch
 import scipy.io.wavfile
+import util
 
 # Load processor and model
 models_info = {
     "Meta-MMS": {
         "processor": AutoTokenizer.from_pretrained("facebook/mms-tts-uig-script_arabic"),
         "model": VitsModel.from_pretrained("facebook/mms-tts-uig-script_arabic"),
+        "arabic_script": True
     },
 }
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 def synthesize(text, model_id):
+    if models_info[model_id]["arabic_script"]:
+        text = util.ug_latn_to_arab(text)
     processor = models_info[model_id]["processor"]
     model = models_info[model_id]["model"].to(device)
     inputs = processor(text, return_tensors="pt").to(device)
@@ -20,7 +24,7 @@ def synthesize(text, model_id):
     output = model(**inputs).waveform.cpu()  # Move output back to CPU for saving
 
     output_path = "tts_output.wav"
-    sample_rate = 16000
+    sample_rate = model.config.sample_rate
     scipy.io.wavfile.write(output_path, rate=sample_rate, data=output.numpy()[0])
 
     return output_path
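Reading the rate from the model config instead of hardcoding 16000 is the right direction; one caveat is that, as far as I know, `VitsConfig` names the field `sampling_rate` rather than `sample_rate`, so `model.config.sample_rate` may raise an AttributeError. A defensive sketch under that assumption:

```python
# Prefer VitsConfig's `sampling_rate` (the field name I believe transformers
# uses), fall back to `sample_rate`, then to the 16 kHz default.
sample_rate = getattr(model.config, "sampling_rate",
                      getattr(model.config, "sample_rate", 16000))
scipy.io.wavfile.write(output_path, rate=sample_rate, data=output.numpy()[0])
```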
util.py ADDED
@@ -0,0 +1,5 @@
+from umsc import UgMultiScriptConverter
+
+# Initialize uyghur script converter
+ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
+ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
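The converter instances are callable, which is how `asr.py` uses them (`util.ug_arab_to_latn(transcription)`). A short usage sketch, where the sample string and its romanization are illustrative and 'UAS'/'ULS' are taken to mean Uyghur Arabic Script and Uyghur Latin Script:

```python
import util

# Arabic-script input -> Latin-script output, and the reverse direction.
latin = util.ug_arab_to_latn("ياخشىمۇسىز")    # e.g. "yaxshimusiz"
arabic = util.ug_latn_to_arab("yaxshimusiz")  # back to Arabic script
```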