Baghdad99 committed
Commit 5b74a4b
1 Parent(s): e3a6dbd
Files changed (1)
  1. app.py +32 -32
app.py CHANGED
@@ -1,39 +1,39 @@
-import gradio
-import torch
-import numpy as np
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForTextToWaveform
-
-# Load your pretrained models
-asr_model = Wav2Vec2ForCTC.from_pretrained("Baghdad99/saad-speech-recognition-hausa-audio-to-text")
-asr_processor = Wav2Vec2Processor.from_pretrained("Baghdad99/saad-speech-recognition-hausa-audio-to-text")
-translation_tokenizer = AutoTokenizer.from_pretrained("Baghdad99/saad-hausa-text-to-english-text")
-translation_model = AutoModelForSeq2SeqLM.from_pretrained("Baghdad99/saad-hausa-text-to-english-text", from_tf=True)
-tts_tokenizer = AutoTokenizer.from_pretrained("Baghdad99/english_voice_tts")
-tts_model = AutoModelForTextToWaveform.from_pretrained("Baghdad99/english_voice_tts")
-
-# Modify the translate function to accept the sampling_rate argument
-def translate(audio_signal, sampling_rate):
-    inputs = asr_processor(audio_signal, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-    logits = asr_model(inputs.input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = asr_processor.decode(predicted_ids[0])
-    translated = translation_model.generate(**translation_tokenizer(transcription, return_tensors="pt", padding=True))
-    translated_text = [translation_tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-    return translated_text
-
-def synthesise(translated_text):
-    inputs = tts_tokenizer(translated_text, return_tensors='pt')
-    audio = tts_model.generate(inputs['input_ids'])
-    return audio
-
-def translate_speech(audio, sampling_rate):
-    translated_text = translate(audio, sampling_rate=sampling_rate)
-    synthesised_speech = synthesise(translated_text)
+import gradio as gr
+from transformers import pipeline
+
+# Load the pipeline for speech recognition and translation
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model="Baghdad99/saad-speech-recognition-hausa-audio-to-text",
+    tokenizer="Baghdad99/saad-speech-recognition-hausa-audio-to-text"
+)
+translator = pipeline("text2text-generation", model="Baghdad99/saad-hausa-text-to-english-text")
+tts = pipeline("text-to-speech", model="Baghdad99/english_voice_tts")
+
+# Define the function to translate speech
+def translate_speech(audio):
+    # Use the speech recognition pipeline to transcribe the audio
+    transcription = pipe(audio, sampling_rate=16000)[0]["transcription"]
+
+    # Use the translation pipeline to translate the transcription
+    translated_text = translator(transcription, return_tensors="pt", padding=True)
+
+    # Use the text-to-speech pipeline to synthesize the translated text
+    synthesised_speech = tts(translated_text, return_tensors='pt')
+
     # Define the max_range variable
     max_range = 32767 # You can adjust this value based on your requirements
     synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
+
     return 16000, synthesised_speech

 # Define the Gradio interface
-iface = gradio.Interface(fn=translate_speech, inputs=gradio.inputs.Audio(source="microphone", type="numpy"), outputs="audio")
+iface = gr.Interface(
+    fn=translate_speech,
+    inputs=gr.inputs.Audio(source="microphone", type="numpy"),
+    outputs=gr.outputs.Audio(type="numpy"),
+    title="Hausa to English Translation",
+    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis."
+)
+
 iface.launch()
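
As committed, the pipeline-based version still has a few bugs the diff does not address: `np.int16` is used but `numpy` is never imported, the speech-recognition pipeline returns a dict with a "text" key rather than something indexable as `[0]["transcription"]`, the text2text-generation pipeline returns a list of dicts under "generated_text", and the text-to-speech pipeline returns a dict holding an "audio" array and its "sampling_rate", so calling `.numpy()` on the result fails. A minimal corrected sketch of `translate_speech`, reusing the `pipe`, `translator` and `tts` objects defined above, might look like the following; it assumes a reasonably recent transformers release (one that ships the "text-to-speech" pipeline task) and Gradio's `type="numpy"` audio format of a `(sample_rate, data)` tuple, so treat it as an illustration rather than a verified drop-in replacement.

import numpy as np

def translate_speech(audio):
    # Gradio's type="numpy" microphone input arrives as (sample_rate, data),
    # where data is usually int16 and may be stereo.
    sample_rate, data = audio
    if data.ndim > 1:
        data = data.mean(axis=1)               # downmix to mono
    data = data.astype(np.float32) / 32768.0   # scale int16 PCM to [-1, 1]

    # The ASR pipeline accepts raw audio plus its sampling rate and returns {"text": ...}
    transcription = pipe({"raw": data, "sampling_rate": sample_rate})["text"]

    # The text2text-generation pipeline returns [{"generated_text": ...}]
    translated_text = translator(transcription)[0]["generated_text"]

    # The text-to-speech pipeline returns {"audio": waveform, "sampling_rate": rate}
    speech = tts(translated_text)
    waveform = np.asarray(speech["audio"]).squeeze()

    # Convert the float waveform to 16-bit PCM for the Gradio audio output
    waveform_int16 = (waveform * 32767).astype(np.int16)
    return speech["sampling_rate"], waveform_int16

Note also that `gr.inputs.Audio` and `gr.outputs.Audio` only exist in older Gradio releases; on current Gradio the rough equivalents are `gr.Audio(sources=["microphone"], type="numpy")` for the input and `gr.Audio(type="numpy")` for the output, so the interface definition may need a similar version-dependent adjustment.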