Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 from PIL import Image
-from transformers import
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 import soundfile as sf
 import torch
 
@@ -16,9 +16,7 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
 )
 
-
-processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-wav2vec2_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+p = pipeline("automatic-speech-recognition")
 
 # Function to handle text input
 def handle_text(text):
@@ -35,18 +33,14 @@ def handle_image(img):
 # Function to handle audio input
 def handle_audio(audio):
     # Gradio's Audio component returns a tuple of (sample_rate, audio_data)
-
-
-    logits = wav2vec2_model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcriptions = wav2vec2_processor.decode(predicted_ids[0])
-    return handle_text(transcriptions)
+    text = p(audio)["text"]
+    return text
 
 
 def chatbot(text, img, audio):
     text_output = handle_text(text) if text is not None else ''
     img_output = handle_image(img) if img is not None else ''
-    audio_output =
+    audio_output = handle_text(audio) if audio is not None else ''
 
     outputs = [o for o in [text_output, img_output, audio_output] if o]
     return "\n".join(outputs)