preetam8 committed on
Commit
cc6d9dc
·
1 Parent(s): 32e9053

Account for recorded audio format

Browse files
Files changed (1) hide show
  1. app.py +10 -0
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import gradio as gr
 
2
  import logging
3
  import numpy as np
4
  import torch
@@ -28,6 +29,15 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
28
 
29
 
30
  def translate(audio):
 
 
 
 
 
 
 
 
 
31
  input_features = whisper_processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features
32
  predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=decoder_ids)
33
  translated_text = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
 
1
  import gradio as gr
2
+ import librosa
3
  import logging
4
  import numpy as np
5
  import torch
 
29
 
30
 
31
  def translate(audio):
32
+ if isinstance(audio, str):
33
+ # Account for recorded audio
34
+ audio = {
35
+ "path": audio,
36
+ "sampling_rate": 16_000,
37
+ "array": librosa.load(audio, sr=16_000)[0]
38
+ }
39
+ elif audio["sampling_rate"] != 16_000:
40
+ audio["array"] = librosa.resample(audio["array"], audio["sampling_rate"], 16_000)
41
  input_features = whisper_processor(audio["array"], sampling_rate=16000, return_tensors="pt").input_features
42
  predicted_ids = whisper_model.generate(input_features, forced_decoder_ids=decoder_ids)
43
  translated_text = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]