frogcho123 committed on
Commit
05fd694
1 Parent(s): 589e047

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -27
app.py CHANGED
@@ -1,60 +1,46 @@
1
- import gradio as gr
2
  import os
 
3
  import whisper
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from gtts import gTTS
6
- import IPython.display as ipd
7
- import numpy as np
8
 
9
  # Load Whisper STT model
10
# Speech-to-text: the "base" Whisper checkpoint.
whisper_model = whisper.load_model("base")

# Text-to-text: the tokenizer and the seq2seq weights are both taken from the
# same checkpoint, so the identifier is written once.
_TRANSLATION_CHECKPOINT = "alirezamsh/small100"
tokenizer = AutoTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CHECKPOINT)
15
 
16
def _prepare_waveform(audio, target_rate=16000):
    """Turn a recording into a mono float32 waveform sampled at *target_rate*.

    Args:
        audio: Either a ``(sample_rate, samples)`` pair -- the form the Gradio
            Audio documentation describes for ``type="numpy"`` inputs -- or a
            bare array that is taken to be sampled at *target_rate* already.
        target_rate: Sample rate of the returned waveform, in Hz.

    Returns:
        A one-dimensional ``float32`` NumPy array.

    Raises:
        ValueError: If *audio* is ``None`` (nothing was recorded).
    """
    if audio is None:
        raise ValueError("No audio was provided")

    if isinstance(audio, (tuple, list)):
        sample_rate, samples = audio
    else:
        sample_rate, samples = target_rate, audio

    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM is scaled into roughly [-1, 1].
        scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Assumes a (samples, channels) layout -- TODO confirm against the
        # Gradio version in use.
        samples = samples.mean(axis=1)

    if sample_rate != target_rate and samples.size:
        # Dependency-free linear resampling; adequate for speech, but it does
        # no anti-alias filtering.
        n_out = max(1, int(round(samples.size * target_rate / sample_rate)))
        src_t = np.arange(samples.size) / sample_rate
        dst_t = np.arange(n_out) / target_rate
        samples = np.interp(dst_t, src_t, samples).astype(np.float32)

    return samples


def translate_speech(audio, target_lang):
    """Transcribe a recording, translate the text and synthesize it as speech.

    The previous implementation called ``audio.tobytes()`` and wrote the
    result to ``recorded_audio.wav`` before re-reading it.  That fails for the
    ``(sample_rate, samples)`` pair a ``type="numpy"`` Gradio input delivers,
    and raw sample bytes are not a WAV container in any case.  The waveform is
    now prepared in memory, so the intermediate file is no longer written.

    Args:
        audio: Recording as accepted by ``_prepare_waveform``.
        target_lang: Language code handed to gTTS for the spoken output.

    Returns:
        Path of the MP3 file containing the synthesized speech.

    Raises:
        ValueError: If *audio* is ``None``.
    """
    waveform = whisper.pad_or_trim(_prepare_waveform(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(whisper_model.device)

    # Detect language: keep the most probable candidate.
    _, probs = whisper_model.detect_language(mel)
    lang = max(probs, key=probs.get)

    # Decode audio into text
    options = whisper.DecodingOptions()
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate text
    # NOTE(review): only the source language is set on the tokenizer; nothing
    # here passes target_lang to the translation model -- confirm the model
    # actually produces text in target_lang.
    tokenizer.src_lang = lang
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Text-to-speech (TTS)
    tts = gTTS(text=translated_text, lang=target_lang)
    audio_path = "translated_audio.mp3"
    tts.save(audio_path)

    return audio_path
48
 
49
def translate_speech_interface(audio, target_lang):
    """Run ``translate_speech`` and return the resulting audio file's bytes.

    Args:
        audio: Recording forwarded unchanged to ``translate_speech``.
        target_lang: Language code forwarded unchanged to ``translate_speech``.

    Returns:
        The full binary content of the file whose path ``translate_speech``
        returned.
    """
    translated_audio = translate_speech(audio, target_lang)
    # Use a context manager so the handle is closed deterministically; the
    # bare open(...).read() left that to the garbage collector.
    with open(translated_audio, "rb") as audio_file:
        return audio_file.read()
54
 
55
# Define the Gradio interface
audio_recording = gr.inputs.Audio(source="microphone", type="numpy", label="Record your speech")
target_language = gr.inputs.Dropdown(["en", "ru", "fr"], label="Target Language")

# The output was declared as type="numpy" while the callback returned the raw
# bytes of an MP3 file, which is neither a (sample_rate, samples) pair nor a
# path.  translate_speech() already returns the path of the file it saves, so
# it is wired in directly and the output component is told to expect a file.
output_audio = gr.outputs.Audio(type="file", label="Translated Audio")

gr.Interface(
    fn=translate_speech,
    inputs=[audio_recording, target_language],
    outputs=output_audio,
    title="Speech Translator",
).launch()
 
 
 
1
  import os
2
+ import gradio as gr
3
  import whisper
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from gtts import gTTS
 
 
6
 
7
  # Load Whisper STT model
8
# "small100" is the name of the translation checkpoint loaded below, not a
# Whisper checkpoint, so whisper.load_model("small100") fails at start-up.
# "base" is the checkpoint this line used before the commit.
whisper_model = whisper.load_model("base")

# Load translation models
tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
13
 
14
def _prepare_waveform(audio, target_rate=16000):
    """Turn a recording into a mono float32 waveform sampled at *target_rate*.

    Args:
        audio: Either a ``(sample_rate, samples)`` pair -- the form the Gradio
            Audio documentation describes for ``type="numpy"`` inputs -- or a
            bare array that is taken to be sampled at *target_rate* already.
        target_rate: Sample rate of the returned waveform, in Hz.

    Returns:
        A one-dimensional ``float32`` NumPy array.

    Raises:
        ValueError: If *audio* is ``None`` (nothing was recorded).
    """
    import numpy as np  # local import: this module does not import numpy itself

    if audio is None:
        raise ValueError("No audio was provided")

    if isinstance(audio, (tuple, list)):
        sample_rate, samples = audio
    else:
        sample_rate, samples = target_rate, audio

    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM is scaled into roughly [-1, 1].
        scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Assumes a (samples, channels) layout -- TODO confirm against the
        # Gradio version in use.
        samples = samples.mean(axis=1)

    if sample_rate != target_rate and samples.size:
        # Dependency-free linear resampling; adequate for speech, but it does
        # no anti-alias filtering.
        n_out = max(1, int(round(samples.size * target_rate / sample_rate)))
        src_t = np.arange(samples.size) / sample_rate
        dst_t = np.arange(n_out) / target_rate
        samples = np.interp(dst_t, src_t, samples).astype(np.float32)

    return samples


def translate_speech(audio):
    """Transcribe a recording, translate the text and synthesize it as speech.

    The previous body used ``audio[0]``, which for the ``(sample_rate,
    samples)`` pair delivered by a ``type="numpy"`` Gradio input is the
    integer sample rate rather than the samples.  The samples are now
    extracted, normalised and resampled by ``_prepare_waveform``.  The
    language-detection call was removed because its result was never used.

    Args:
        audio: Recording as accepted by ``_prepare_waveform``.

    Returns:
        Path of the MP3 file containing the synthesized speech.

    Raises:
        ValueError: If *audio* is ``None``.
    """
    waveform = whisper.pad_or_trim(_prepare_waveform(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(whisper_model.device)

    # fp16 is switched off explicitly for decoding.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    text = result.text

    # Translate text
    # NOTE(review): source and spoken language are both hard-coded to English
    # and no target language is selected for the translation model -- confirm
    # that this is the intended behaviour.
    tokenizer.src_lang = 'en'  # Assuming the input is always in English
    encoded_text = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(**encoded_text)
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # Text-to-speech (TTS)
    tts = gTTS(text=translated_text, lang='en')  # Assuming the target language is English
    audio_path = "translated_audio.mp3"
    tts.save(audio_path)

    return audio_path
35
 
36
def translate_speech_interface(audio):
    """Run ``translate_speech`` and return the resulting audio file's bytes.

    Args:
        audio: Recording forwarded unchanged to ``translate_speech``.

    Returns:
        The full binary content of the file whose path ``translate_speech``
        returned.
    """
    translated_audio = translate_speech(audio)
    # Use a context manager so the handle is closed deterministically; the
    # bare open(...).read() left that to the garbage collector.
    with open(translated_audio, "rb") as audio_file:
        translated_audio_bytes = audio_file.read()

    return translated_audio_bytes
41
 
 
audio_recording = gr.inputs.Audio(source="microphone", type="numpy", label="Record your speech")

# The output was declared as type="numpy" while the callback returned the raw
# bytes of an MP3 file, which is neither a (sample_rate, samples) pair nor a
# path.  translate_speech() already returns the path of the file it saves, so
# it is wired in directly and the output component is told to expect a file.
output_audio = gr.outputs.Audio(type="file", label="Translated Audio")

iface = gr.Interface(
    fn=translate_speech,
    inputs=audio_recording,
    outputs=output_audio,
    title="Speech Translator",
)
iface.launch()