frogcho123 committed on
Commit
dcb549e
1 Parent(s): 341a129

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -9
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  import gradio as gr
3
- import numpy as np
4
  import whisper
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  from gtts import gTTS
@@ -13,12 +13,9 @@ tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
13
  model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
14
 
15
  def translate_speech(audio, target_lang):
16
- if isinstance(audio, tuple):
17
- audio = audio[0]
18
- if isinstance(audio, int):
19
- audio = [audio]
20
- audio = np.array(audio).astype("float32") # Convert audio to float32
21
- audio = whisper.pad_or_trim(audio, whisper_model.audio_config.sample_rate)
22
  mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
23
  _, probs = whisper_model.detect_language(mel)
24
  options = whisper.DecodingOptions(fp16=False)
@@ -39,8 +36,6 @@ def translate_speech(audio, target_lang):
39
  return audio_path
40
 
41
 
42
-
43
-
44
  def translate_speech_interface(audio, target_lang):
45
  translated_audio = translate_speech(audio, target_lang)
46
  translated_audio_bytes = open(translated_audio, "rb").read()
 
1
  import os
2
  import gradio as gr
3
+ import numpy as np
4
  import whisper
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  from gtts import gTTS
 
13
  model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
14
 
15
  def translate_speech(audio, target_lang):
16
+ audio = audio[0].astype("float32") # Extract audio from tuple and convert to float32
17
+ sample_rate = whisper.sample_rate # Get sample rate from whisper_model
18
+ audio = whisper.pad_or_trim(audio, sample_rate)
 
 
 
19
  mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
20
  _, probs = whisper_model.detect_language(mel)
21
  options = whisper.DecodingOptions(fp16=False)
 
36
  return audio_path
37
 
38
 
 
 
39
  def translate_speech_interface(audio, target_lang):
40
  translated_audio = translate_speech(audio, target_lang)
41
  translated_audio_bytes = open(translated_audio, "rb").read()