frogcho123 committed on
Commit
29135e4
1 Parent(s): b9553d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -12,7 +12,7 @@ tokenizer = AutoTokenizer.from_pretrained("alirezamsh/small100")
12
  model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
13
 
14
  def translate_speech(audio, target_lang):
15
- audio = audio.astype("float32")
16
  audio = whisper.pad_or_trim(audio, whisper_model.audio_config.sample_rate)
17
  mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
18
  _, probs = whisper_model.detect_language(mel)
@@ -21,7 +21,7 @@ def translate_speech(audio, target_lang):
21
  text = result.text
22
 
23
  # Translate text
24
- tokenizer.src_lang = target_lang # Assuming the input is always in English
25
  encoded_text = tokenizer(text, return_tensors="pt")
26
  generated_tokens = model.generate(**encoded_text)
27
  translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
@@ -33,6 +33,7 @@ def translate_speech(audio, target_lang):
33
 
34
  return audio_path
35
 
 
36
  def translate_speech_interface(audio, target_lang):
37
  translated_audio = translate_speech(audio, target_lang)
38
  translated_audio_bytes = open(translated_audio, "rb").read()
 
12
  model = AutoModelForSeq2SeqLM.from_pretrained("alirezamsh/small100")
13
 
14
  def translate_speech(audio, target_lang):
15
+ audio = audio[0].astype("float32") # Extract audio from tuple and convert to float32
16
  audio = whisper.pad_or_trim(audio, whisper_model.audio_config.sample_rate)
17
  mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)
18
  _, probs = whisper_model.detect_language(mel)
 
21
  text = result.text
22
 
23
  # Translate text
24
+ tokenizer.src_lang = target_lang
25
  encoded_text = tokenizer(text, return_tensors="pt")
26
  generated_tokens = model.generate(**encoded_text)
27
  translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
 
33
 
34
  return audio_path
35
 
36
+
37
  def translate_speech_interface(audio, target_lang):
38
  translated_audio = translate_speech(audio, target_lang)
39
  translated_audio_bytes = open(translated_audio, "rb").read()