MusIre committed
Commit e416e8e · 1 Parent(s): 630d7c9

Update app.py

Files changed (1)
app.py +40 -28
app.py CHANGED
@@ -3,43 +3,55 @@ import gradio as gr # Add this import statement
 
 subprocess.run(["python", "-m", "pip", "install", "--upgrade", "pip"])
 subprocess.run(["pip", "install", "gradio", "--upgrade"])
-subprocess.run(["pip", "install", "transformers"])
-subprocess.run(["pip", "install", "torch", "torchvision", "torchaudio", "-f", "https://download.pytorch.org/whl/torch_stable.html"])
-
+subprocess.run(["pip", "install", "soundfile"])
+subprocess.run(["pip", "install", "numpy"])
+subprocess.run(["pip", "install", "pydub"])
+subprocess.run(["pip", "install", "openai"])
 
 import gradio as gr
-import torchaudio
-
-from transformers import AutoModelForSpeechRecognition, AutoTokenizer, pipeline
+import openai
+import soundfile as sf
+import numpy as np
+from pydub import AudioSegment
+from io import BytesIO
 
-# Load the Whispy/Whisper Italian ASR model
-model_name = "facebook/whisper-large-italian"
-whisper_italian_asr = pipeline("automatic-speech-recognition", model=model_name, device=0)
+# Set your OpenAI API key
+openai.api_key = "YOUR_OPENAI_API_KEY"
 
+# Whisper ASR model
+whisper_model = "whisper-small"
 
-# Define the ASR function
-def transcribe_audio(audio):
-    # Save the audio file
-    torchaudio.save("user_audio.wav", audio.squeeze().numpy(), 16000)
+# Define the Gradio interface
+iface = gr.Interface(
+    fn=None,  # To be defined later
+    inputs=gr.Audio(),
+    outputs=gr.Textbox(),
+    live=True,
+)
 
-    # Load the saved audio file
-    user_audio, _ = torchaudio.load("user_audio.wav", normalize=True)
+# Define the function for ASR
+def transcribe_audio(audio_data):
+    # Convert the audio data to a suitable format
+    audio = AudioSegment.from_file(BytesIO(audio_data), format="wav")
+    audio.export("temp.wav", format="wav")
+
+    # Load the audio file using soundfile
+    audio_array, _ = sf.read("temp.wav")
 
-    # Perform ASR using the Whispy/Whisper Italian model
-    transcription = whisper_italian_asr(user_audio.numpy())
+    # Perform ASR using OpenAI's Whisper
+    response = openai.Completion.create(
+        engine=whisper_model,
+        audio_input=audio_array.tolist(),
+        content_type="audio/wav",
+    )
 
-    return transcription[0]["transcription"]
+    # Extract the transcribed text from the response
+    transcription = response["choices"][0]["text"].strip()
 
-# Create the Gradio interface
-audio_input = gr.Audio(preprocess=torchaudio.transforms.Resample(orig_freq=44100, new_freq=16000))
+    return transcription
 
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=audio_input,
-    outputs="text",
-    live=True,
-    interpretation="default"
-)
+# Set the function for the Gradio interface
+iface.fn = transcribe_audio
 
 # Launch the Gradio app
-iface.launch(share=True)
+iface.launch()
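
A caveat on the added code: openai.Completion.create is a text-completion endpoint and does not accept audio_input or content_type arguments, and "whisper-small" is not a model the OpenAI API serves (the hosted Whisper model is "whisper-1"; the small checkpoint belongs to the open-source release). Below is a minimal sketch of how the transcription step could work instead, assuming the pre-1.0 openai SDK that the openai.api_key = ... style above implies; the API key stays a placeholder.

import gradio as gr
import openai

openai.api_key = "YOUR_OPENAI_API_KEY"

def transcribe_audio(audio_path):
    # gr.Audio(type="filepath") hands the function a path to the recorded
    # file, so no pydub/BytesIO round-trip through "temp.wav" is needed.
    with open(audio_path, "rb") as audio_file:
        # Pre-1.0 SDK call; the hosted API exposes Whisper as "whisper-1".
        response = openai.Audio.transcribe("whisper-1", audio_file)
    return response["text"]

iface = gr.Interface(
    fn=transcribe_audio,  # pass the function at construction time
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Textbox(),
)

# Launch the Gradio app
iface.launch()

Passing fn=transcribe_audio at construction avoids the fn=None / iface.fn = transcribe_audio patching in the committed version, since Gradio wires its event handlers when the interface is built and is not guaranteed to pick up a later reassignment. live=True is dropped here as well: with live audio input it would re-call the paid API on every change to the recording.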