mobinln committed on
Commit
d1dfef4
·
1 Parent(s): 7c11e96

fix: audio interface

Browse files
Files changed (1) hide show
  1. app.py +23 -15
app.py CHANGED
@@ -7,29 +7,37 @@ model = WhisperForConditionalGeneration.from_pretrained("Neurai/NeuraSpeech_Whis
7
  forced_decoder_ids = processor.get_decoder_prompt_ids(language="fa", task="transcribe")
8
 
9
 
10
- def transcribe(audio, *args):
11
- print(audio, args)
12
  if audio is None:
13
  return "No audio input provided. Please record or upload an audio file."
14
 
15
- # audio is now a file path, not a tuple
16
- try:
17
- array, sample_rate = librosa.load(audio, sr=16000)
18
- except Exception as e:
19
- return f"Error loading audio file: {str(e)}"
20
-
21
- # The rest of the function remains the same
22
  array = librosa.to_mono(array)
23
- input_features = processor(array, sampling_rate=sample_rate, return_tensors="pt").input_features
 
24
 
25
  # generate token ids
26
- predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
27
  # decode token ids to text
28
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
29
- print(transcription)
30
- return transcription[0] # Return the first (and only) transcription
31
-
32
-
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  demo = gr.Interface(
34
  fn=transcribe,
35
  inputs=[gr.Audio(sources=["microphone"], type="filepath")],
 
7
  forced_decoder_ids = processor.get_decoder_prompt_ids(language="fa", task="transcribe")
8
 
9
 
10
+ def transcribe(audio):
 
11
  if audio is None:
12
  return "No audio input provided. Please record or upload an audio file."
13
 
14
+ sample_rate, array = audio
15
+ sr = 16000
 
 
 
 
 
16
  array = librosa.to_mono(array)
17
+ array = librosa.resample(array, orig_sr=sample_rate, target_sr=16000)
18
+ input_features = processor(array, sampling_rate=sr, return_tensors="pt").input_features
19
 
20
  # generate token ids
21
+ predicted_ids = model.generate(input_features)
22
  # decode token ids to text
23
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
24
+ return transcription
25
+
26
+
27
+ # input_audio = gr.Audio(
28
+ # sources=["microphone"],
29
+ # waveform_options=gr.WaveformOptions(
30
+ # waveform_color="#01C6FF",
31
+ # waveform_progress_color="#0066B4",
32
+ # skip_length=2,
33
+ # show_controls=True,
34
+ # ),
35
+ # )
36
+ # demo = gr.Interface(
37
+ # fn=reverse_audio,
38
+ # inputs=input_audio,
39
+ # outputs="text"
40
+ # )
41
  demo = gr.Interface(
42
  fn=transcribe,
43
  inputs=[gr.Audio(sources=["microphone"], type="filepath")],