mobinln committed on
Commit
31de337
·
1 Parent(s): 2432ca4

fix: audio interface

Browse files
Files changed (1) hide show
  1. app.py +13 -23
app.py CHANGED
@@ -8,37 +8,27 @@ forced_decoder_ids = processor.get_decoder_prompt_ids(language="fa", task="trans
8
 
9
 
10
  def transcribe(audio):
11
- print(audio)
12
  if audio is None:
13
  return "No audio input provided. Please record or upload an audio file."
14
 
15
- sample_rate, array = audio
16
- sr = 16000
 
 
 
 
 
17
  array = librosa.to_mono(array)
18
- array = librosa.resample(array, orig_sr=sample_rate, target_sr=16000)
19
- input_features = processor(array, sampling_rate=sr, return_tensors="pt").input_features
20
 
21
  # generate token ids
22
- predicted_ids = model.generate(input_features)
23
  # decode token ids to text
24
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
25
- return transcription
26
-
27
-
28
- # input_audio = gr.Audio(
29
- # sources=["microphone"],
30
- # waveform_options=gr.WaveformOptions(
31
- # waveform_color="#01C6FF",
32
- # waveform_progress_color="#0066B4",
33
- # skip_length=2,
34
- # show_controls=True,
35
- # ),
36
- # )
37
- # demo = gr.Interface(
38
- # fn=reverse_audio,
39
- # inputs=input_audio,
40
- # outputs="text"
41
- # )
42
  demo = gr.Interface(
43
  fn=transcribe,
44
  inputs=[gr.Audio(sources=["microphone"], type="filepath")],
 
8
 
9
 
10
  def transcribe(audio):
 
11
  if audio is None:
12
  return "No audio input provided. Please record or upload an audio file."
13
 
14
+ # audio is now a file path, not a tuple
15
+ try:
16
+ array, sample_rate = librosa.load(audio, sr=16000)
17
+ except Exception as e:
18
+ return f"Error loading audio file: {str(e)}"
19
+
20
+ # The rest of the function remains the same
21
  array = librosa.to_mono(array)
22
+ input_features = processor(array, sampling_rate=sample_rate, return_tensors="pt").input_features
 
23
 
24
  # generate token ids
25
+ predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
26
  # decode token ids to text
27
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
28
+ print(transcription)
29
+ return transcription[0] # Return the first (and only) transcription
30
+
31
+
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  demo = gr.Interface(
33
  fn=transcribe,
34
  inputs=[gr.Audio(sources=["microphone"], type="filepath")],