Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 from PIL import Image
-from transformers import
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 import soundfile as sf
 import torch
 
@@ -16,9 +16,7 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
 )
 
-
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+p = pipeline("automatic-speech-recognition")
 
 # Function to handle text input
 def handle_text(text):
@@ -35,18 +33,14 @@ def handle_image(img):
 # Function to handle audio input
 def handle_audio(audio):
     # Gradio's Audio component returns a tuple of (sample_rate, audio_data)
-
-
-    logits = wav2vec2_model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcriptions = wav2vec2_processor.decode(predicted_ids[0])
-    return handle_text(transcriptions)
+    text = p(audio)["text"]
+    return text
 
 
 def chatbot(text, img, audio):
     text_output = handle_text(text) if text is not None else ''
     img_output = handle_image(img) if img is not None else ''
-    audio_output =
+    audio_output = handle_text(audio) if audio is not None else ''
 
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)